diff options
Diffstat (limited to 'fs/xfs')
231 files changed, 73900 insertions, 76286 deletions
diff --git a/fs/xfs/Kbuild b/fs/xfs/Kbuild deleted file mode 100644 index 2566e96706f..00000000000 --- a/fs/xfs/Kbuild +++ /dev/null @@ -1,6 +0,0 @@ -# -# The xfs people like to share Makefile with 2.6 and 2.4. -# Utilise file named Kbuild file which has precedence over Makefile. -# - -include $(srctree)/$(obj)/Makefile-linux-2.6 diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 26b364c9d62..399e8cec6e6 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -1,5 +1,8 @@ config XFS_FS tristate "XFS filesystem support" + depends on BLOCK + select EXPORTFS + select LIBCRC32C help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can @@ -20,6 +23,7 @@ config XFS_FS config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS + select QUOTACTL help If you say Y here, you will be able to set limits for disk usage on a per user and/or a per group basis under XFS. XFS considers quota @@ -34,21 +38,10 @@ config XFS_QUOTA with or without the generic quota support enabled (CONFIG_QUOTA) - they are completely independent subsystems. -config XFS_SECURITY - bool "XFS Security Label support" - depends on XFS_FS - help - Security labels support alternative access control models - implemented by security modules like SELinux. This option - enables an extended attribute namespace for inode security - labels in the XFS filesystem. - - If you are not using a security module that requires using - extended attributes for inode security labels, say N. - config XFS_POSIX_ACL bool "XFS POSIX ACL support" depends on XFS_FS + select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. @@ -75,3 +68,29 @@ config XFS_RT See the xfs man page in section 5 for additional information. If unsure, say N. + +config XFS_WARN + bool "XFS Verbose Warnings" + depends on XFS_FS && !XFS_DEBUG + help + Say Y here to get an XFS build with many additional warnings. + It converts ASSERT checks to WARN, so will log any out-of-bounds + conditions that occur that would otherwise be missed. It is much + lighter weight than XFS_DEBUG and does not modify algorithms and will + not cause the kernel to panic on non-fatal errors. + + However, similar to XFS_DEBUG, it is only advisable to use this if you + are debugging a particular problem. + +config XFS_DEBUG + bool "XFS Debugging support" + depends on XFS_FS + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 49e3e7e5e3d..c21f4350666 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -1 +1,116 @@ -include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL) +# +# Copyright (c) 2000-2005 Silicon Graphics, Inc. +# All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# + +ccflags-y += -I$(src) # needed for trace events + +ccflags-$(CONFIG_XFS_DEBUG) += -g + +obj-$(CONFIG_XFS_FS) += xfs.o + +# this one should be compiled first, as the tracing macros can easily blow up +xfs-y += xfs_trace.o + +# highlevel code +xfs-y += xfs_aops.o \ + xfs_attr_inactive.o \ + xfs_attr_list.o \ + xfs_bit.o \ + xfs_bmap_util.o \ + xfs_buf.o \ + xfs_dir2_readdir.o \ + xfs_discard.o \ + xfs_error.o \ + xfs_export.o \ + xfs_extent_busy.o \ + xfs_file.o \ + xfs_filestream.o \ + xfs_fsops.o \ + xfs_globals.o \ + xfs_icache.o \ + xfs_ioctl.o \ + xfs_iomap.o \ + xfs_iops.o \ + xfs_itable.o \ + xfs_message.o \ + xfs_mount.o \ + xfs_mru_cache.o \ + xfs_super.o \ + xfs_symlink.o \ + xfs_trans.o \ + xfs_xattr.o \ + kmem.o \ + uuid.o + +# code shared with libxfs +xfs-y += xfs_alloc.o \ + xfs_alloc_btree.o \ + xfs_attr.o \ + xfs_attr_leaf.o \ + xfs_attr_remote.o \ + xfs_bmap.o \ + xfs_bmap_btree.o \ + xfs_btree.o \ + xfs_da_btree.o \ + xfs_da_format.o \ + xfs_dir2.o \ + xfs_dir2_block.o \ + xfs_dir2_data.o \ + xfs_dir2_leaf.o \ + xfs_dir2_node.o \ + xfs_dir2_sf.o \ + xfs_dquot_buf.o \ + xfs_ialloc.o \ + xfs_ialloc_btree.o \ + xfs_icreate_item.o \ + xfs_inode.o \ + xfs_inode_fork.o \ + xfs_inode_buf.o \ + xfs_log_recover.o \ + xfs_log_rlimit.o \ + xfs_sb.o \ + xfs_symlink_remote.o \ + xfs_trans_resv.o + +# low-level transaction/log code +xfs-y += xfs_log.o \ + xfs_log_cil.o \ + xfs_buf_item.o \ + xfs_extfree_item.o \ + xfs_inode_item.o \ + xfs_trans_ail.o \ + xfs_trans_buf.o \ + xfs_trans_extfree.o \ + xfs_trans_inode.o \ + +# optional features +xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ + xfs_dquot_item.o \ + xfs_trans_dquot.o \ + xfs_qm_syscalls.o \ + xfs_qm_bhv.o \ + xfs_qm.o \ + xfs_quotaops.o + +# xfs_rtbitmap is shared with libxfs +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_rtbitmap.o + +xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o +xfs-$(CONFIG_PROC_FS) += xfs_stats.o +xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o +xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6 deleted file mode 100644 index 9e7f85986d0..00000000000 --- a/fs/xfs/Makefile-linux-2.6 +++ /dev/null @@ -1,135 +0,0 @@ -# -# Copyright (c) 2000-2005 Silicon Graphics, Inc. -# All Rights Reserved. -# -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation. -# -# This program is distributed in the hope that it would be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write the Free Software Foundation, -# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# - -EXTRA_CFLAGS += -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char - -XFS_LINUX := linux-2.6 - -ifeq ($(CONFIG_XFS_DEBUG),y) - EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG - EXTRA_CFLAGS += -DXFS_BUF_LOCK_TRACKING -endif -ifeq ($(CONFIG_XFS_TRACE),y) - EXTRA_CFLAGS += -DXFS_ALLOC_TRACE - EXTRA_CFLAGS += -DXFS_ATTR_TRACE - EXTRA_CFLAGS += -DXFS_BLI_TRACE - EXTRA_CFLAGS += -DXFS_BMAP_TRACE - EXTRA_CFLAGS += -DXFS_BMBT_TRACE - EXTRA_CFLAGS += -DXFS_DIR_TRACE - EXTRA_CFLAGS += -DXFS_DIR2_TRACE - EXTRA_CFLAGS += -DXFS_DQUOT_TRACE - EXTRA_CFLAGS += -DXFS_ILOCK_TRACE - EXTRA_CFLAGS += -DXFS_LOG_TRACE - EXTRA_CFLAGS += -DXFS_RW_TRACE - EXTRA_CFLAGS += -DXFS_BUF_TRACE - EXTRA_CFLAGS += -DXFS_VNODE_TRACE -endif - -obj-$(CONFIG_XFS_FS) += xfs.o - -xfs-$(CONFIG_XFS_QUOTA) += $(addprefix quota/, \ - xfs_dquot.o \ - xfs_dquot_item.o \ - xfs_trans_dquot.o \ - xfs_qm_syscalls.o \ - xfs_qm_bhv.o \ - xfs_qm.o) - -ifeq ($(CONFIG_XFS_QUOTA),y) -xfs-$(CONFIG_PROC_FS) += quota/xfs_qm_stats.o -endif - -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o -xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o -xfs-$(CONFIG_PROC_FS) += $(XFS_LINUX)/xfs_stats.o -xfs-$(CONFIG_SYSCTL) += $(XFS_LINUX)/xfs_sysctl.o -xfs-$(CONFIG_COMPAT) += $(XFS_LINUX)/xfs_ioctl32.o - - -xfs-y += xfs_alloc.o \ - xfs_alloc_btree.o \ - xfs_attr.o \ - xfs_attr_leaf.o \ - xfs_behavior.o \ - xfs_bit.o \ - xfs_bmap.o \ - xfs_bmap_btree.o \ - xfs_btree.o \ - xfs_buf_item.o \ - xfs_da_btree.o \ - xfs_dir2.o \ - xfs_dir2_block.o \ - xfs_dir2_data.o \ - xfs_dir2_leaf.o \ - xfs_dir2_node.o \ - xfs_dir2_sf.o \ - xfs_error.o \ - xfs_extfree_item.o \ - xfs_fsops.o \ - xfs_ialloc.o \ - xfs_ialloc_btree.o \ - xfs_iget.o \ - xfs_inode.o \ - xfs_inode_item.o \ - xfs_iocore.o \ - xfs_iomap.o \ - xfs_itable.o \ - xfs_dfrag.o \ - xfs_log.o \ - xfs_log_recover.o \ - xfs_mount.o \ - xfs_rename.o \ - xfs_trans.o \ - xfs_trans_ail.o \ - xfs_trans_buf.o \ - xfs_trans_extfree.o \ - xfs_trans_inode.o \ - xfs_trans_item.o \ - xfs_utils.o \ - xfs_vfsops.o \ - xfs_vnodeops.o \ - xfs_rw.o \ - xfs_dmops.o \ - xfs_qmops.o - -xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o - -# Objects in linux/ -xfs-y += $(addprefix $(XFS_LINUX)/, \ - kmem.o \ - xfs_aops.o \ - xfs_buf.o \ - xfs_export.o \ - xfs_file.o \ - xfs_fs_subr.o \ - xfs_globals.o \ - xfs_ioctl.o \ - xfs_iops.o \ - xfs_lrw.o \ - xfs_super.o \ - xfs_vfs.o \ - xfs_vnode.o) - -# Objects in support/ -xfs-y += $(addprefix support/, \ - debug.o \ - move.o \ - uuid.o) - -xfs-$(CONFIG_XFS_TRACE) += support/ktrace.o - diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c new file mode 100644 index 00000000000..844e288b957 --- /dev/null +++ b/fs/xfs/kmem.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include "time.h" +#include "kmem.h" +#include "xfs_message.h" + +/* + * Greedy allocation. May fail and may return vmalloced memory. + */ +void * +kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) +{ + void *ptr; + size_t kmsize = maxsize; + + while (!(ptr = vzalloc(kmsize))) { + if ((kmsize >>= 1) <= minsize) + kmsize = minsize; + } + if (ptr) + *size = kmsize; + return ptr; +} + +void * +kmem_alloc(size_t size, xfs_km_flags_t flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmalloc(size, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} + +void * +kmem_zalloc_large(size_t size, xfs_km_flags_t flags) +{ + unsigned noio_flag = 0; + void *ptr; + gfp_t lflags; + + ptr = kmem_zalloc(size, flags | KM_MAYFAIL); + if (ptr) + return ptr; + + /* + * __vmalloc() will allocate data pages and auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context + * here. Hence we need to tell memory reclaim that we are in such a + * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering + * the filesystem here and potentially deadlocking. + */ + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + noio_flag = memalloc_noio_save(); + + lflags = kmem_flags_convert(flags); + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + + if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) + memalloc_noio_restore(noio_flag); + + return ptr; +} + +void +kmem_free(const void *ptr) +{ + if (!is_vmalloc_addr(ptr)) { + kfree(ptr); + } else { + vfree(ptr); + } +} + +void * +kmem_realloc(const void *ptr, size_t newsize, size_t oldsize, + xfs_km_flags_t flags) +{ + void *new; + + new = kmem_alloc(newsize, flags); + if (ptr) { + if (new) + memcpy(new, ptr, + ((oldsize < newsize) ? oldsize : newsize)); + kmem_free(ptr); + } + return new; +} + +void * +kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) +{ + int retries = 0; + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + do { + ptr = kmem_cache_alloc(zone, lflags); + if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) + return ptr; + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, lflags); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } while (1); +} diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/kmem.h index 0e8293c5a32..64db0e53ede 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/kmem.h @@ -21,15 +21,18 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/vmalloc.h> /* * General memory allocation interfaces */ -#define KM_SLEEP 0x0001u -#define KM_NOSLEEP 0x0002u -#define KM_NOFS 0x0004u -#define KM_MAYFAIL 0x0008u +typedef unsigned __bitwise xfs_km_flags_t; +#define KM_SLEEP ((__force xfs_km_flags_t)0x0001u) +#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u) +#define KM_NOFS ((__force xfs_km_flags_t)0x0004u) +#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u) +#define KM_ZERO ((__force xfs_km_flags_t)0x0010u) /* * We use a special process flag to avoid recursive callbacks into @@ -37,11 +40,11 @@ * warnings, so we explicitly skip any generic ones (silly of us). */ static inline gfp_t -kmem_flags_convert(unsigned int __nocast flags) +kmem_flags_convert(xfs_km_flags_t flags) { gfp_t lflags; - BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL)); + BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO)); if (flags & KM_NOSLEEP) { lflags = GFP_ATOMIC | __GFP_NOWARN; @@ -50,13 +53,26 @@ kmem_flags_convert(unsigned int __nocast flags) if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS)) lflags &= ~__GFP_FS; } + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + return lflags; } -extern void *kmem_alloc(size_t, unsigned int __nocast); -extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast); -extern void *kmem_zalloc(size_t, unsigned int __nocast); -extern void kmem_free(void *, size_t); +extern void *kmem_alloc(size_t, xfs_km_flags_t); +extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t); +extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t); +extern void kmem_free(const void *); + + +extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); + +static inline void * +kmem_zalloc(size_t size, xfs_km_flags_t flags) +{ + return kmem_alloc(size, flags | KM_ZERO); +} /* * Zone interfaces @@ -72,14 +88,14 @@ extern void kmem_free(void *, size_t); static inline kmem_zone_t * kmem_zone_init(int size, char *zone_name) { - return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL); + return kmem_cache_create(zone_name, size, 0, 0, NULL); } static inline kmem_zone_t * kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, - void (*construct)(void *, kmem_zone_t *, unsigned long)) + void (*construct)(void *)) { - return kmem_cache_create(zone_name, size, 0, flags, construct, NULL); + return kmem_cache_create(zone_name, size, 0, flags, construct); } static inline void @@ -95,32 +111,12 @@ kmem_zone_destroy(kmem_zone_t *zone) kmem_cache_destroy(zone); } -extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast); -extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast); - -/* - * Low memory cache shrinkers - */ - -typedef struct shrinker *kmem_shaker_t; -typedef int (*kmem_shake_func_t)(int, gfp_t); - -static inline kmem_shaker_t -kmem_shake_register(kmem_shake_func_t sfunc) -{ - return set_shrinker(DEFAULT_SEEKS, sfunc); -} - -static inline void -kmem_shake_deregister(kmem_shaker_t shrinker) -{ - remove_shrinker(shrinker); -} +extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t); -static inline int -kmem_shake_allow(gfp_t gfp_mask) +static inline void * +kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags) { - return (gfp_mask & __GFP_WAIT); + return kmem_zone_alloc(zone, flags | KM_ZERO); } #endif /* __XFS_SUPPORT_KMEM_H__ */ diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c deleted file mode 100644 index aba7fcf881a..00000000000 --- a/fs/xfs/linux-2.6/kmem.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/vmalloc.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include <linux/blkdev.h> -#include "time.h" -#include "kmem.h" - -#define MAX_VMALLOCS 6 -#define MAX_SLAB_SIZE 0x20000 - -void * -kmem_alloc(size_t size, unsigned int __nocast flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - do { - if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS) - ptr = kmalloc(size, lflags); - else - ptr = __vmalloc(size, lflags, PAGE_KERNEL); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) - return ptr; - if (!(++retries % 100)) - printk(KERN_ERR "XFS: possible memory allocation " - "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, lflags); - blk_congestion_wait(WRITE, HZ/50); - } while (1); -} - -void * -kmem_zalloc(size_t size, unsigned int __nocast flags) -{ - void *ptr; - - ptr = kmem_alloc(size, flags); - if (ptr) - memset((char *)ptr, 0, (int)size); - return ptr; -} - -void -kmem_free(void *ptr, size_t size) -{ - if (((unsigned long)ptr < VMALLOC_START) || - ((unsigned long)ptr >= VMALLOC_END)) { - kfree(ptr); - } else { - vfree(ptr); - } -} - -void * -kmem_realloc(void *ptr, size_t newsize, size_t oldsize, - unsigned int __nocast flags) -{ - void *new; - - new = kmem_alloc(newsize, flags); - if (ptr) { - if (new) - memcpy(new, ptr, - ((oldsize < newsize) ? oldsize : newsize)); - kmem_free(ptr, oldsize); - } - return new; -} - -void * -kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) -{ - int retries = 0; - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - do { - ptr = kmem_cache_alloc(zone, lflags); - if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) - return ptr; - if (!(++retries % 100)) - printk(KERN_ERR "XFS: possible memory allocation " - "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, lflags); - blk_congestion_wait(WRITE, HZ/50); - } while (1); -} - -void * -kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags) -{ - void *ptr; - - ptr = kmem_zone_alloc(zone, flags); - if (ptr) - memset((char *)ptr, 0, kmem_cache_size(zone)); - return ptr; -} diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h deleted file mode 100644 index b25090094cc..00000000000 --- a/fs/xfs/linux-2.6/sema.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SEMA_H__ -#define __XFS_SUPPORT_SEMA_H__ - -#include <linux/time.h> -#include <linux/wait.h> -#include <asm/atomic.h> -#include <asm/semaphore.h> - -/* - * sema_t structure just maps to struct semaphore in Linux kernel. - */ - -typedef struct semaphore sema_t; - -#define init_sema(sp, val, c, d) sema_init(sp, val) -#define initsema(sp, val) sema_init(sp, val) -#define initnsema(sp, val, name) sema_init(sp, val) -#define psema(sp, b) down(sp) -#define vsema(sp) up(sp) -#define freesema(sema) do { } while (0) - -static inline int issemalocked(sema_t *sp) -{ - return down_trylock(sp) || (up(sp), 0); -} - -/* - * Map cpsema (try to get the sema) to down_trylock. We need to switch - * the return values since cpsema returns 1 (acquired) 0 (failed) and - * down_trylock returns the reverse 0 (acquired) 1 (failed). - */ -static inline int cpsema(sema_t *sp) -{ - return down_trylock(sp) ? 0 : 1; -} - -#endif /* __XFS_SUPPORT_SEMA_H__ */ diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h deleted file mode 100644 index 50a6191178f..00000000000 --- a/fs/xfs/linux-2.6/spin.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SPIN_H__ -#define __XFS_SUPPORT_SPIN_H__ - -#include <linux/sched.h> /* preempt needs this */ -#include <linux/spinlock.h> - -/* - * Map lock_t from IRIX to Linux spinlocks. - * - * We do not make use of lock_t from interrupt context, so we do not - * have to worry about disabling interrupts at all (unlike IRIX). - */ - -typedef spinlock_t lock_t; - -#define SPLDECL(s) unsigned long s -#ifndef DEFINE_SPINLOCK -#define DEFINE_SPINLOCK(s) spinlock_t s = SPIN_LOCK_UNLOCKED -#endif - -#define spinlock_init(lock, name) spin_lock_init(lock) -#define spinlock_destroy(lock) -#define mutex_spinlock(lock) ({ spin_lock(lock); 0; }) -#define mutex_spinunlock(lock, s) do { spin_unlock(lock); (void)s; } while (0) -#define nested_spinlock(lock) spin_lock(lock) -#define nested_spinunlock(lock) spin_unlock(lock) - -#endif /* __XFS_SUPPORT_SPIN_H__ */ diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h deleted file mode 100644 index 9a8ad481b00..00000000000 --- a/fs/xfs/linux-2.6/sv.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_SV_H__ -#define __XFS_SUPPORT_SV_H__ - -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/spinlock.h> - -/* - * Synchronisation variables. - * - * (Parameters "pri", "svf" and "rts" are not implemented) - */ - -typedef struct sv_s { - wait_queue_head_t waiters; -} sv_t; - -#define SV_FIFO 0x0 /* sv_t is FIFO type */ -#define SV_LIFO 0x2 /* sv_t is LIFO type */ -#define SV_PRIO 0x4 /* sv_t is PRIO type */ -#define SV_KEYED 0x6 /* sv_t is KEYED type */ -#define SV_DEFAULT SV_FIFO - - -static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state, - unsigned long timeout) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&sv->waiters, &wait); - __set_current_state(state); - spin_unlock(lock); - - schedule_timeout(timeout); - - remove_wait_queue(&sv->waiters, &wait); -} - -#define init_sv(sv,type,name,flag) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_init(sv,flag,name) \ - init_waitqueue_head(&(sv)->waiters) -#define sv_destroy(sv) \ - /*NOTHING*/ -#define sv_wait(sv, pri, lock, s) \ - _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) -#define sv_wait_sig(sv, pri, lock, s) \ - _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT) -#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \ - _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts)) -#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \ - _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts)) -#define sv_signal(sv) \ - wake_up(&(sv)->waiters) -#define sv_broadcast(sv) \ - wake_up_all(&(sv)->waiters) - -#endif /* __XFS_SUPPORT_SV_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c deleted file mode 100644 index 34dcb43a783..00000000000 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ /dev/null @@ -1,1478 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_trans.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_iomap.h" -#include <linux/mpage.h> -#include <linux/pagevec.h> -#include <linux/writeback.h> - -STATIC void -xfs_count_page_state( - struct page *page, - int *delalloc, - int *unmapped, - int *unwritten) -{ - struct buffer_head *bh, *head; - - *delalloc = *unmapped = *unwritten = 0; - - bh = head = page_buffers(page); - do { - if (buffer_uptodate(bh) && !buffer_mapped(bh)) - (*unmapped) = 1; - else if (buffer_unwritten(bh) && !buffer_delay(bh)) - clear_buffer_unwritten(bh); - else if (buffer_unwritten(bh)) - (*unwritten) = 1; - else if (buffer_delay(bh)) - (*delalloc) = 1; - } while ((bh = bh->b_this_page) != head); -} - -#if defined(XFS_RW_TRACE) -void -xfs_page_trace( - int tag, - struct inode *inode, - struct page *page, - int mask) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp = vn_from_inode(inode); - loff_t isize = i_size_read(inode); - loff_t offset = page_offset(page); - int delalloc = -1, unmapped = -1, unwritten = -1; - - if (page_has_buffers(page)) - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - - ip = xfs_vtoi(vp); - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)inode, - (void *)page, - (void *)((unsigned long)mask), - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((isize >> 32) & 0xffffffff)), - (void *)((unsigned long)(isize & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)delalloc), - (void *)((unsigned long)unmapped), - (void *)((unsigned long)unwritten), - (void *)((unsigned long)current_pid()), - (void *)NULL); -} -#else -#define xfs_page_trace(tag, inode, page, mask) -#endif - -/* - * Schedule IO completion handling on a xfsdatad if this was - * the final hold on this ioend. - */ -STATIC void -xfs_finish_ioend( - xfs_ioend_t *ioend) -{ - if (atomic_dec_and_test(&ioend->io_remaining)) - queue_work(xfsdatad_workqueue, &ioend->io_work); -} - -/* - * We're now finished for good with this ioend structure. - * Update the page state via the associated buffer_heads, - * release holds on the inode and bio, and finally free - * up memory. Do not use the ioend after this. - */ -STATIC void -xfs_destroy_ioend( - xfs_ioend_t *ioend) -{ - struct buffer_head *bh, *next; - - for (bh = ioend->io_buffer_head; bh; bh = next) { - next = bh->b_private; - bh->b_end_io(bh, !ioend->io_error); - } - if (unlikely(ioend->io_error)) - vn_ioerror(ioend->io_vnode, ioend->io_error, __FILE__,__LINE__); - vn_iowake(ioend->io_vnode); - mempool_free(ioend, xfs_ioend_pool); -} - -/* - * Buffered IO write completion for delayed allocate extents. - * TODO: Update ondisk isize now that we know the file data - * has been flushed (i.e. the notorious "NULL file" problem). - */ -STATIC void -xfs_end_bio_delalloc( - void *data) -{ - xfs_ioend_t *ioend = data; - - xfs_destroy_ioend(ioend); -} - -/* - * Buffered IO write completion for regular, written extents. - */ -STATIC void -xfs_end_bio_written( - void *data) -{ - xfs_ioend_t *ioend = data; - - xfs_destroy_ioend(ioend); -} - -/* - * IO write completion for unwritten extents. - * - * Issue transactions to convert a buffer range from unwritten - * to written extents. - */ -STATIC void -xfs_end_bio_unwritten( - void *data) -{ - xfs_ioend_t *ioend = data; - bhv_vnode_t *vp = ioend->io_vnode; - xfs_off_t offset = ioend->io_offset; - size_t size = ioend->io_size; - - if (likely(!ioend->io_error)) - bhv_vop_bmap(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL); - xfs_destroy_ioend(ioend); -} - -/* - * Allocate and initialise an IO completion structure. - * We need to track unwritten extent write completion here initially. - * We'll need to extend this for updating the ondisk inode size later - * (vs. incore size). - */ -STATIC xfs_ioend_t * -xfs_alloc_ioend( - struct inode *inode, - unsigned int type) -{ - xfs_ioend_t *ioend; - - ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); - - /* - * Set the count to 1 initially, which will prevent an I/O - * completion callback from happening before we have started - * all the I/O from calling the completion routine too early. - */ - atomic_set(&ioend->io_remaining, 1); - ioend->io_error = 0; - ioend->io_list = NULL; - ioend->io_type = type; - ioend->io_vnode = vn_from_inode(inode); - ioend->io_buffer_head = NULL; - ioend->io_buffer_tail = NULL; - atomic_inc(&ioend->io_vnode->v_iocount); - ioend->io_offset = 0; - ioend->io_size = 0; - - if (type == IOMAP_UNWRITTEN) - INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); - else if (type == IOMAP_DELAY) - INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend); - else - INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend); - - return ioend; -} - -STATIC int -xfs_map_blocks( - struct inode *inode, - loff_t offset, - ssize_t count, - xfs_iomap_t *mapp, - int flags) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - int error, nmaps = 1; - - error = bhv_vop_bmap(vp, offset, count, flags, mapp, &nmaps); - if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE))) - VMODIFY(vp); - return -error; -} - -STATIC inline int -xfs_iomap_valid( - xfs_iomap_t *iomapp, - loff_t offset) -{ - return offset >= iomapp->iomap_offset && - offset < iomapp->iomap_offset + iomapp->iomap_bsize; -} - -/* - * BIO completion handler for buffered IO. - */ -STATIC int -xfs_end_bio( - struct bio *bio, - unsigned int bytes_done, - int error) -{ - xfs_ioend_t *ioend = bio->bi_private; - - if (bio->bi_size) - return 1; - - ASSERT(atomic_read(&bio->bi_cnt) >= 1); - ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; - - /* Toss bio and pass work off to an xfsdatad thread */ - bio->bi_private = NULL; - bio->bi_end_io = NULL; - bio_put(bio); - - xfs_finish_ioend(ioend); - return 0; -} - -STATIC void -xfs_submit_ioend_bio( - xfs_ioend_t *ioend, - struct bio *bio) -{ - atomic_inc(&ioend->io_remaining); - - bio->bi_private = ioend; - bio->bi_end_io = xfs_end_bio; - - submit_bio(WRITE, bio); - ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP)); - bio_put(bio); -} - -STATIC struct bio * -xfs_alloc_ioend_bio( - struct buffer_head *bh) -{ - struct bio *bio; - int nvecs = bio_get_nr_vecs(bh->b_bdev); - - do { - bio = bio_alloc(GFP_NOIO, nvecs); - nvecs >>= 1; - } while (!bio); - - ASSERT(bio->bi_private == NULL); - bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; - bio_get(bio); - return bio; -} - -STATIC void -xfs_start_buffer_writeback( - struct buffer_head *bh) -{ - ASSERT(buffer_mapped(bh)); - ASSERT(buffer_locked(bh)); - ASSERT(!buffer_delay(bh)); - ASSERT(!buffer_unwritten(bh)); - - mark_buffer_async_write(bh); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); -} - -STATIC void -xfs_start_page_writeback( - struct page *page, - struct writeback_control *wbc, - int clear_dirty, - int buffers) -{ - ASSERT(PageLocked(page)); - ASSERT(!PageWriteback(page)); - set_page_writeback(page); - if (clear_dirty) - clear_page_dirty(page); - unlock_page(page); - if (!buffers) { - end_page_writeback(page); - wbc->pages_skipped++; /* We didn't write this page */ - } -} - -static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) -{ - return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); -} - -/* - * Submit all of the bios for all of the ioends we have saved up, covering the - * initial writepage page and also any probed pages. - * - * Because we may have multiple ioends spanning a page, we need to start - * writeback on all the buffers before we submit them for I/O. If we mark the - * buffers as we got, then we can end up with a page that only has buffers - * marked async write and I/O complete on can occur before we mark the other - * buffers async write. - * - * The end result of this is that we trip a bug in end_page_writeback() because - * we call it twice for the one page as the code in end_buffer_async_write() - * assumes that all buffers on the page are started at the same time. - * - * The fix is two passes across the ioend list - one to start writeback on the - * buffer_heads, and then submit them for I/O on the second pass. - */ -STATIC void -xfs_submit_ioend( - xfs_ioend_t *ioend) -{ - xfs_ioend_t *head = ioend; - xfs_ioend_t *next; - struct buffer_head *bh; - struct bio *bio; - sector_t lastblock = 0; - - /* Pass 1 - start writeback */ - do { - next = ioend->io_list; - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - xfs_start_buffer_writeback(bh); - } - } while ((ioend = next) != NULL); - - /* Pass 2 - submit I/O */ - ioend = head; - do { - next = ioend->io_list; - bio = NULL; - - for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { - - if (!bio) { - retry: - bio = xfs_alloc_ioend_bio(bh); - } else if (bh->b_blocknr != lastblock + 1) { - xfs_submit_ioend_bio(ioend, bio); - goto retry; - } - - if (bio_add_buffer(bio, bh) != bh->b_size) { - xfs_submit_ioend_bio(ioend, bio); - goto retry; - } - - lastblock = bh->b_blocknr; - } - if (bio) - xfs_submit_ioend_bio(ioend, bio); - xfs_finish_ioend(ioend); - } while ((ioend = next) != NULL); -} - -/* - * Cancel submission of all buffer_heads so far in this endio. - * Toss the endio too. Only ever called for the initial page - * in a writepage request, so only ever one page. - */ -STATIC void -xfs_cancel_ioend( - xfs_ioend_t *ioend) -{ - xfs_ioend_t *next; - struct buffer_head *bh, *next_bh; - - do { - next = ioend->io_list; - bh = ioend->io_buffer_head; - do { - next_bh = bh->b_private; - clear_buffer_async_write(bh); - unlock_buffer(bh); - } while ((bh = next_bh) != NULL); - - vn_iowake(ioend->io_vnode); - mempool_free(ioend, xfs_ioend_pool); - } while ((ioend = next) != NULL); -} - -/* - * Test to see if we've been building up a completion structure for - * earlier buffers -- if so, we try to append to this ioend if we - * can, otherwise we finish off any current ioend and start another. - * Return true if we've finished the given ioend. - */ -STATIC void -xfs_add_to_ioend( - struct inode *inode, - struct buffer_head *bh, - xfs_off_t offset, - unsigned int type, - xfs_ioend_t **result, - int need_ioend) -{ - xfs_ioend_t *ioend = *result; - - if (!ioend || need_ioend || type != ioend->io_type) { - xfs_ioend_t *previous = *result; - - ioend = xfs_alloc_ioend(inode, type); - ioend->io_offset = offset; - ioend->io_buffer_head = bh; - ioend->io_buffer_tail = bh; - if (previous) - previous->io_list = ioend; - *result = ioend; - } else { - ioend->io_buffer_tail->b_private = bh; - ioend->io_buffer_tail = bh; - } - - bh->b_private = NULL; - ioend->io_size += bh->b_size; -} - -STATIC void -xfs_map_buffer( - struct buffer_head *bh, - xfs_iomap_t *mp, - xfs_off_t offset, - uint block_bits) -{ - sector_t bn; - - ASSERT(mp->iomap_bn != IOMAP_DADDR_NULL); - - bn = (mp->iomap_bn >> (block_bits - BBSHIFT)) + - ((offset - mp->iomap_offset) >> block_bits); - - ASSERT(bn || (mp->iomap_flags & IOMAP_REALTIME)); - - bh->b_blocknr = bn; - set_buffer_mapped(bh); -} - -STATIC void -xfs_map_at_offset( - struct buffer_head *bh, - loff_t offset, - int block_bits, - xfs_iomap_t *iomapp) -{ - ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY)); - - lock_buffer(bh); - xfs_map_buffer(bh, iomapp, offset, block_bits); - bh->b_bdev = iomapp->iomap_target->bt_bdev; - set_buffer_mapped(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); -} - -/* - * Look for a page at index that is suitable for clustering. - */ -STATIC unsigned int -xfs_probe_page( - struct page *page, - unsigned int pg_offset, - int mapped) -{ - int ret = 0; - - if (PageWriteback(page)) - return 0; - - if (page->mapping && PageDirty(page)) { - if (page_has_buffers(page)) { - struct buffer_head *bh, *head; - - bh = head = page_buffers(page); - do { - if (!buffer_uptodate(bh)) - break; - if (mapped != buffer_mapped(bh)) - break; - ret += bh->b_size; - if (ret >= pg_offset) - break; - } while ((bh = bh->b_this_page) != head); - } else - ret = mapped ? 0 : PAGE_CACHE_SIZE; - } - - return ret; -} - -STATIC size_t -xfs_probe_cluster( - struct inode *inode, - struct page *startpage, - struct buffer_head *bh, - struct buffer_head *head, - int mapped) -{ - struct pagevec pvec; - pgoff_t tindex, tlast, tloff; - size_t total = 0; - int done = 0, i; - - /* First sum forwards in this page */ - do { - if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) - return total; - total += bh->b_size; - } while ((bh = bh->b_this_page) != head); - - /* if we reached the end of the page, sum forwards in following pages */ - tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT; - tindex = startpage->index + 1; - - /* Prune this back to avoid pathological behavior */ - tloff = min(tlast, startpage->index + 64); - - pagevec_init(&pvec, 0); - while (!done && tindex <= tloff) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; - size_t pg_offset, len = 0; - - if (tindex == tlast) { - pg_offset = - i_size_read(inode) & (PAGE_CACHE_SIZE - 1); - if (!pg_offset) { - done = 1; - break; - } - } else - pg_offset = PAGE_CACHE_SIZE; - - if (page->index == tindex && !TestSetPageLocked(page)) { - len = xfs_probe_page(page, pg_offset, mapped); - unlock_page(page); - } - - if (!len) { - done = 1; - break; - } - - total += len; - tindex++; - } - - pagevec_release(&pvec); - cond_resched(); - } - - return total; -} - -/* - * Test if a given page is suitable for writing as part of an unwritten - * or delayed allocate extent. - */ -STATIC int -xfs_is_delayed_page( - struct page *page, - unsigned int type) -{ - if (PageWriteback(page)) - return 0; - - if (page->mapping && page_has_buffers(page)) { - struct buffer_head *bh, *head; - int acceptable = 0; - - bh = head = page_buffers(page); - do { - if (buffer_unwritten(bh)) - acceptable = (type == IOMAP_UNWRITTEN); - else if (buffer_delay(bh)) - acceptable = (type == IOMAP_DELAY); - else if (buffer_dirty(bh) && buffer_mapped(bh)) - acceptable = (type == 0); - else - break; - } while ((bh = bh->b_this_page) != head); - - if (acceptable) - return 1; - } - - return 0; -} - -/* - * Allocate & map buffers for page given the extent map. Write it out. - * except for the original page of a writepage, this is called on - * delalloc/unwritten pages only, for the original page it is possible - * that the page has no mapping at all. - */ -STATIC int -xfs_convert_page( - struct inode *inode, - struct page *page, - loff_t tindex, - xfs_iomap_t *mp, - xfs_ioend_t **ioendp, - struct writeback_control *wbc, - int startio, - int all_bh) -{ - struct buffer_head *bh, *head; - xfs_off_t end_offset; - unsigned long p_offset; - unsigned int type; - int bbits = inode->i_blkbits; - int len, page_dirty; - int count = 0, done = 0, uptodate = 1; - xfs_off_t offset = page_offset(page); - - if (page->index != tindex) - goto fail; - if (TestSetPageLocked(page)) - goto fail; - if (PageWriteback(page)) - goto fail_unlock_page; - if (page->mapping != inode->i_mapping) - goto fail_unlock_page; - if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) - goto fail_unlock_page; - - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - i_size_read(inode)); - - len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; - - bh = head = page_buffers(page); - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh))) { - done = 1; - continue; - } - - if (buffer_unwritten(bh) || buffer_delay(bh)) { - if (buffer_unwritten(bh)) - type = IOMAP_UNWRITTEN; - else - type = IOMAP_DELAY; - - if (!xfs_iomap_valid(mp, offset)) { - done = 1; - continue; - } - - ASSERT(!(mp->iomap_flags & IOMAP_HOLE)); - ASSERT(!(mp->iomap_flags & IOMAP_DELAY)); - - xfs_map_at_offset(bh, offset, bbits, mp); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; - count++; - } else { - type = 0; - if (buffer_mapped(bh) && all_bh && startio) { - lock_buffer(bh); - xfs_add_to_ioend(inode, bh, offset, - type, ioendp, done); - count++; - page_dirty--; - } else { - done = 1; - } - } - } while (offset += len, (bh = bh->b_this_page) != head); - - if (uptodate && bh == head) - SetPageUptodate(page); - - if (startio) { - if (count) { - struct backing_dev_info *bdi; - - bdi = inode->i_mapping->backing_dev_info; - wbc->nr_to_write--; - if (bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } else if (wbc->nr_to_write <= 0) { - done = 1; - } - } - xfs_start_page_writeback(page, wbc, !page_dirty, count); - } - - return done; - fail_unlock_page: - unlock_page(page); - fail: - return 1; -} - -/* - * Convert & write out a cluster of pages in the same extent as defined - * by mp and following the start page. - */ -STATIC void -xfs_cluster_write( - struct inode *inode, - pgoff_t tindex, - xfs_iomap_t *iomapp, - xfs_ioend_t **ioendp, - struct writeback_control *wbc, - int startio, - int all_bh, - pgoff_t tlast) -{ - struct pagevec pvec; - int done = 0, i; - - pagevec_init(&pvec, 0); - while (!done && tindex <= tlast) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - done = xfs_convert_page(inode, pvec.pages[i], tindex++, - iomapp, ioendp, wbc, startio, all_bh); - if (done) - break; - } - - pagevec_release(&pvec); - cond_resched(); - } -} - -/* - * Calling this without startio set means we are being asked to make a dirty - * page ready for freeing it's buffers. When called with startio set then - * we are coming from writepage. - * - * When called with startio set it is important that we write the WHOLE - * page if possible. - * The bh->b_state's cannot know if any of the blocks or which block for - * that matter are dirty due to mmap writes, and therefore bh uptodate is - * only valid if the page itself isn't completely uptodate. Some layers - * may clear the page dirty flag prior to calling write page, under the - * assumption the entire page will be written out; by not writing out the - * whole page the page can be reused before all valid dirty data is - * written out. Note: in the case of a page that has been dirty'd by - * mapwrite and but partially setup by block_prepare_write the - * bh->b_states's will not agree and only ones setup by BPW/BCW will have - * valid state, thus the whole page must be written out thing. - */ - -STATIC int -xfs_page_state_convert( - struct inode *inode, - struct page *page, - struct writeback_control *wbc, - int startio, - int unmapped) /* also implies page uptodate */ -{ - struct buffer_head *bh, *head; - xfs_iomap_t iomap; - xfs_ioend_t *ioend = NULL, *iohead = NULL; - loff_t offset; - unsigned long p_offset = 0; - unsigned int type; - __uint64_t end_offset; - pgoff_t end_index, last_index, tlast; - ssize_t size, len; - int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0; - int trylock = 0; - int all_bh = unmapped; - - if (startio) { - if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) - trylock |= BMAPI_TRYLOCK; - } - - /* Is this page beyond the end of the file? */ - offset = i_size_read(inode); - end_index = offset >> PAGE_CACHE_SHIFT; - last_index = (offset - 1) >> PAGE_CACHE_SHIFT; - if (page->index >= end_index) { - if ((page->index >= end_index + 1) || - !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) { - if (startio) - unlock_page(page); - return 0; - } - } - - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset); - len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; - - bh = head = page_buffers(page); - offset = page_offset(page); - flags = -1; - type = 0; - - /* TODO: cleanup count and page_dirty */ - - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) { - /* - * the iomap is actually still valid, but the ioend - * isn't. shouldn't happen too often. - */ - iomap_valid = 0; - continue; - } - - if (iomap_valid) - iomap_valid = xfs_iomap_valid(&iomap, offset); - - /* - * First case, map an unwritten extent and prepare for - * extent state conversion transaction on completion. - * - * Second case, allocate space for a delalloc buffer. - * We can return EAGAIN here in the release page case. - * - * Third case, an unmapped buffer was found, and we are - * in a path where we need to write the whole page out. - */ - if (buffer_unwritten(bh) || buffer_delay(bh) || - ((buffer_uptodate(bh) || PageUptodate(page)) && - !buffer_mapped(bh) && (unmapped || startio))) { - /* - * Make sure we don't use a read-only iomap - */ - if (flags == BMAPI_READ) - iomap_valid = 0; - - if (buffer_unwritten(bh)) { - type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE | BMAPI_IGNSTATE; - } else if (buffer_delay(bh)) { - type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE | trylock; - } else { - type = IOMAP_NEW; - flags = BMAPI_WRITE | BMAPI_MMAP; - } - - if (!iomap_valid) { - if (type == IOMAP_NEW) { - size = xfs_probe_cluster(inode, - page, bh, head, 0); - } else { - size = len; - } - - err = xfs_map_blocks(inode, offset, size, - &iomap, flags); - if (err) - goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); - } - if (iomap_valid) { - xfs_map_at_offset(bh, offset, - inode->i_blkbits, &iomap); - if (startio) { - xfs_add_to_ioend(inode, bh, offset, - type, &ioend, - !iomap_valid); - } else { - set_buffer_dirty(bh); - unlock_buffer(bh); - mark_buffer_dirty(bh); - } - page_dirty--; - count++; - } - } else if (buffer_uptodate(bh) && startio) { - /* - * we got here because the buffer is already mapped. - * That means it must already have extents allocated - * underneath it. Map the extent by reading it. - */ - if (!iomap_valid || type != 0) { - flags = BMAPI_READ; - size = xfs_probe_cluster(inode, page, bh, - head, 1); - err = xfs_map_blocks(inode, offset, size, - &iomap, flags); - if (err) - goto error; - iomap_valid = xfs_iomap_valid(&iomap, offset); - } - - type = 0; - if (!test_and_set_bit(BH_Lock, &bh->b_state)) { - ASSERT(buffer_mapped(bh)); - if (iomap_valid) - all_bh = 1; - xfs_add_to_ioend(inode, bh, offset, type, - &ioend, !iomap_valid); - page_dirty--; - count++; - } else { - iomap_valid = 0; - } - } else if ((buffer_uptodate(bh) || PageUptodate(page)) && - (unmapped || startio)) { - iomap_valid = 0; - } - - if (!iohead) - iohead = ioend; - - } while (offset += len, ((bh = bh->b_this_page) != head)); - - if (uptodate && bh == head) - SetPageUptodate(page); - - if (startio) - xfs_start_page_writeback(page, wbc, 1, count); - - if (ioend && iomap_valid) { - offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >> - PAGE_CACHE_SHIFT; - tlast = min_t(pgoff_t, offset, last_index); - xfs_cluster_write(inode, page->index + 1, &iomap, &ioend, - wbc, startio, all_bh, tlast); - } - - if (iohead) - xfs_submit_ioend(iohead); - - return page_dirty; - -error: - if (iohead) - xfs_cancel_ioend(iohead); - - /* - * If it's delalloc and we have nowhere to put it, - * throw it away, unless the lower layers told - * us to try again. - */ - if (err != -EAGAIN) { - if (!unmapped) - block_invalidatepage(page, 0); - ClearPageUptodate(page); - } - return err; -} - -/* - * writepage: Called from one of two places: - * - * 1. we are flushing a delalloc buffer head. - * - * 2. we are writing out a dirty page. Typically the page dirty - * state is cleared before we get here. In this case is it - * conceivable we have no buffer heads. - * - * For delalloc space on the page we need to allocate space and - * flush it. For unmapped buffer heads on the page we should - * allocate space if the page is uptodate. For any other dirty - * buffer heads on the page we should flush them. - * - * If we detect that a transaction would be required to flush - * the page, we have to check the process flags first, if we - * are already in a transaction or disk I/O during allocations - * is off, we need to fail the writepage and redirty the page. - */ - -STATIC int -xfs_vm_writepage( - struct page *page, - struct writeback_control *wbc) -{ - int error; - int need_trans; - int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; - - xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0); - - /* - * We need a transaction if: - * 1. There are delalloc buffers on the page - * 2. The page is uptodate and we have unmapped buffers - * 3. The page is uptodate and we have no buffers - * 4. There are unwritten buffers on the page - */ - - if (!page_has_buffers(page)) { - unmapped = 1; - need_trans = 1; - } else { - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!PageUptodate(page)) - unmapped = 0; - need_trans = delalloc + unmapped + unwritten; - } - - /* - * If we need a transaction and the process flags say - * we are already in a transaction, or no IO is allowed - * then mark the page dirty again and leave the page - * as is. - */ - if (current_test_flags(PF_FSTRANS) && need_trans) - goto out_fail; - - /* - * Delay hooking up buffer heads until we have - * made our go/no-go decision. - */ - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); - - /* - * Convert delayed allocate, unwritten or unmapped space - * to real space and flush out to disk. - */ - error = xfs_page_state_convert(inode, page, wbc, 1, unmapped); - if (error == -EAGAIN) - goto out_fail; - if (unlikely(error < 0)) - goto out_unlock; - - return 0; - -out_fail: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -out_unlock: - unlock_page(page); - return error; -} - -STATIC int -xfs_vm_writepages( - struct address_space *mapping, - struct writeback_control *wbc) -{ - struct bhv_vnode *vp = vn_from_inode(mapping->host); - - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - return generic_writepages(mapping, wbc); -} - -/* - * Called to move a page into cleanable state - and from there - * to be released. Possibly the page is already clean. We always - * have buffer heads in this call. - * - * Returns 0 if the page is ok to release, 1 otherwise. - * - * Possible scenarios are: - * - * 1. We are being called to release a page which has been written - * to via regular I/O. buffer heads will be dirty and possibly - * delalloc. If no delalloc buffer heads in this case then we - * can just return zero. - * - * 2. We are called to release a page which has been written via - * mmap, all we need to do is ensure there is no delalloc - * state in the buffer heads, if not we can let the caller - * free them and we should come back later via writepage. - */ -STATIC int -xfs_vm_releasepage( - struct page *page, - gfp_t gfp_mask) -{ - struct inode *inode = page->mapping->host; - int dirty, delalloc, unmapped, unwritten; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 1, - }; - - xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask); - - if (!page_has_buffers(page)) - return 0; - - xfs_count_page_state(page, &delalloc, &unmapped, &unwritten); - if (!delalloc && !unwritten) - goto free_buffers; - - if (!(gfp_mask & __GFP_FS)) - return 0; - - /* If we are already inside a transaction or the thread cannot - * do I/O, we cannot release this page. - */ - if (current_test_flags(PF_FSTRANS)) - return 0; - - /* - * Convert delalloc space to real space, do not flush the - * data out to disk, that will be done by the caller. - * Never need to allocate space here - we will always - * come back to writepage in that case. - */ - dirty = xfs_page_state_convert(inode, page, &wbc, 0, 0); - if (dirty == 0 && !unwritten) - goto free_buffers; - return 0; - -free_buffers: - return try_to_free_buffers(page); -} - -STATIC int -__xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create, - int direct, - bmapi_flags_t flags) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - xfs_iomap_t iomap; - xfs_off_t offset; - ssize_t size; - int niomap = 1; - int error; - - offset = (xfs_off_t)iblock << inode->i_blkbits; - ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); - size = bh_result->b_size; - error = bhv_vop_bmap(vp, offset, size, - create ? flags : BMAPI_READ, &iomap, &niomap); - if (error) - return -error; - if (niomap == 0) - return 0; - - if (iomap.iomap_bn != IOMAP_DADDR_NULL) { - /* - * For unwritten extents do not report a disk address on - * the read case (treat as if we're reading into a hole). - */ - if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) { - xfs_map_buffer(bh_result, &iomap, offset, - inode->i_blkbits); - } - if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) { - if (direct) - bh_result->b_private = inode; - set_buffer_unwritten(bh_result); - set_buffer_delay(bh_result); - } - } - - /* - * If this is a realtime file, data may be on a different device. - * to that pointed to from the buffer_head b_bdev currently. - */ - bh_result->b_bdev = iomap.iomap_target->bt_bdev; - - /* - * If we previously allocated a block out beyond eof and we are - * now coming back to use it then we will need to flag it as new - * even if it has a disk address. - */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || (iomap.iomap_flags & IOMAP_NEW))) - set_buffer_new(bh_result); - - if (iomap.iomap_flags & IOMAP_DELAY) { - BUG_ON(direct); - if (create) { - set_buffer_uptodate(bh_result); - set_buffer_mapped(bh_result); - set_buffer_delay(bh_result); - } - } - - if (direct || size > (1 << inode->i_blkbits)) { - ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); - offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, size); - bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); - } - - return 0; -} - -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, - bh_result, create, 0, BMAPI_WRITE); -} - -STATIC int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, - bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT); -} - -STATIC void -xfs_end_io_direct( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - xfs_ioend_t *ioend = iocb->private; - - /* - * Non-NULL private data means we need to issue a transaction to - * convert a range from unwritten to written extents. This needs - * to happen from process context but aio+dio I/O completion - * happens from irq context so we need to defer it to a workqueue. - * This is not necessary for synchronous direct I/O, but we do - * it anyway to keep the code uniform and simpler. - * - * The core direct I/O code might be changed to always call the - * completion handler in the future, in which case all this can - * go away. - */ - if (private && size > 0) { - ioend->io_offset = offset; - ioend->io_size = size; - xfs_finish_ioend(ioend); - } else { - ASSERT(size >= 0); - xfs_destroy_ioend(ioend); - } - - /* - * blockdev_direct_IO can return an error even after the I/O - * completion handler was called. Thus we need to protect - * against double-freeing. - */ - iocb->private = NULL; -} - -STATIC ssize_t -xfs_vm_direct_IO( - int rw, - struct kiocb *iocb, - const struct iovec *iov, - loff_t offset, - unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - xfs_iomap_t iomap; - int maps = 1; - int error; - ssize_t ret; - - error = bhv_vop_bmap(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps); - if (error) - return -error; - - iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN); - - if (rw == WRITE) { - ret = blockdev_direct_IO_own_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } else { - ret = blockdev_direct_IO_no_locking(rw, iocb, inode, - iomap.iomap_target->bt_bdev, - iov, offset, nr_segs, - xfs_get_blocks_direct, - xfs_end_io_direct); - } - - if (unlikely(ret <= 0 && iocb->private)) - xfs_destroy_ioend(iocb->private); - return ret; -} - -STATIC int -xfs_vm_prepare_write( - struct file *file, - struct page *page, - unsigned int from, - unsigned int to) -{ - return block_prepare_write(page, from, to, xfs_get_blocks); -} - -STATIC sector_t -xfs_vm_bmap( - struct address_space *mapping, - sector_t block) -{ - struct inode *inode = (struct inode *)mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - bhv_vop_rwlock(vp, VRWLOCK_READ); - bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF); - bhv_vop_rwunlock(vp, VRWLOCK_READ); - return generic_block_bmap(mapping, block, xfs_get_blocks); -} - -STATIC int -xfs_vm_readpage( - struct file *unused, - struct page *page) -{ - return mpage_readpage(page, xfs_get_blocks); -} - -STATIC int -xfs_vm_readpages( - struct file *unused, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) -{ - return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); -} - -STATIC void -xfs_vm_invalidatepage( - struct page *page, - unsigned long offset) -{ - xfs_page_trace(XFS_INVALIDPAGE_ENTER, - page->mapping->host, page, offset); - block_invalidatepage(page, offset); -} - -const struct address_space_operations xfs_address_space_operations = { - .readpage = xfs_vm_readpage, - .readpages = xfs_vm_readpages, - .writepage = xfs_vm_writepage, - .writepages = xfs_vm_writepages, - .sync_page = block_sync_page, - .releasepage = xfs_vm_releasepage, - .invalidatepage = xfs_vm_invalidatepage, - .prepare_write = xfs_vm_prepare_write, - .commit_write = generic_commit_write, - .bmap = xfs_vm_bmap, - .direct_IO = xfs_vm_direct_IO, - .migratepage = buffer_migrate_page, -}; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c deleted file mode 100644 index 2af528dcfb0..00000000000 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ /dev/null @@ -1,1855 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <linux/stddef.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/pagemap.h> -#include <linux/init.h> -#include <linux/vmalloc.h> -#include <linux/bio.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/workqueue.h> -#include <linux/percpu.h> -#include <linux/blkdev.h> -#include <linux/hash.h> -#include <linux/kthread.h> -#include <linux/migrate.h> -#include "xfs_linux.h" - -STATIC kmem_zone_t *xfs_buf_zone; -STATIC kmem_shaker_t xfs_buf_shake; -STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); -STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); - -STATIC struct workqueue_struct *xfslogd_workqueue; -struct workqueue_struct *xfsdatad_workqueue; - -#ifdef XFS_BUF_TRACE -void -xfs_buf_trace( - xfs_buf_t *bp, - char *id, - void *data, - void *ra) -{ - ktrace_enter(xfs_buf_trace_buf, - bp, id, - (void *)(unsigned long)bp->b_flags, - (void *)(unsigned long)bp->b_hold.counter, - (void *)(unsigned long)bp->b_sema.count.counter, - (void *)current, - data, ra, - (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), - (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), - (void *)(unsigned long)bp->b_buffer_length, - NULL, NULL, NULL, NULL, NULL); -} -ktrace_t *xfs_buf_trace_buf; -#define XFS_BUF_TRACE_SIZE 4096 -#define XB_TRACE(bp, id, data) \ - xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) -#else -#define XB_TRACE(bp, id, data) do { } while (0) -#endif - -#ifdef XFS_BUF_LOCK_TRACKING -# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) -# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) -# define XB_GET_OWNER(bp) ((bp)->b_last_holder) -#else -# define XB_SET_OWNER(bp) do { } while (0) -# define XB_CLEAR_OWNER(bp) do { } while (0) -# define XB_GET_OWNER(bp) do { } while (0) -#endif - -#define xb_to_gfp(flags) \ - ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ - ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) - -#define xb_to_km(flags) \ - (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) - -#define xfs_buf_allocate(flags) \ - kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) -#define xfs_buf_deallocate(bp) \ - kmem_zone_free(xfs_buf_zone, (bp)); - -/* - * Page Region interfaces. - * - * For pages in filesystems where the blocksize is smaller than the - * pagesize, we use the page->private field (long) to hold a bitmap - * of uptodate regions within the page. - * - * Each such region is "bytes per page / bits per long" bytes long. - * - * NBPPR == number-of-bytes-per-page-region - * BTOPR == bytes-to-page-region (rounded up) - * BTOPRT == bytes-to-page-region-truncated (rounded down) - */ -#if (BITS_PER_LONG == 32) -#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ -#elif (BITS_PER_LONG == 64) -#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ -#else -#error BITS_PER_LONG must be 32 or 64 -#endif -#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) -#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) -#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) - -STATIC unsigned long -page_region_mask( - size_t offset, - size_t length) -{ - unsigned long mask; - int first, final; - - first = BTOPR(offset); - final = BTOPRT(offset + length - 1); - first = min(first, final); - - mask = ~0UL; - mask <<= BITS_PER_LONG - (final - first); - mask >>= BITS_PER_LONG - (final); - - ASSERT(offset + length <= PAGE_CACHE_SIZE); - ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); - - return mask; -} - -STATIC inline void -set_page_region( - struct page *page, - size_t offset, - size_t length) -{ - set_page_private(page, - page_private(page) | page_region_mask(offset, length)); - if (page_private(page) == ~0UL) - SetPageUptodate(page); -} - -STATIC inline int -test_page_region( - struct page *page, - size_t offset, - size_t length) -{ - unsigned long mask = page_region_mask(offset, length); - - return (mask && (page_private(page) & mask) == mask); -} - -/* - * Mapping of multi-page buffers into contiguous virtual space - */ - -typedef struct a_list { - void *vm_addr; - struct a_list *next; -} a_list_t; - -STATIC a_list_t *as_free_head; -STATIC int as_list_len; -STATIC DEFINE_SPINLOCK(as_lock); - -/* - * Try to batch vunmaps because they are costly. - */ -STATIC void -free_address( - void *addr) -{ - a_list_t *aentry; - - aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); - if (likely(aentry)) { - spin_lock(&as_lock); - aentry->next = as_free_head; - aentry->vm_addr = addr; - as_free_head = aentry; - as_list_len++; - spin_unlock(&as_lock); - } else { - vunmap(addr); - } -} - -STATIC void -purge_addresses(void) -{ - a_list_t *aentry, *old; - - if (as_free_head == NULL) - return; - - spin_lock(&as_lock); - aentry = as_free_head; - as_free_head = NULL; - as_list_len = 0; - spin_unlock(&as_lock); - - while ((old = aentry) != NULL) { - vunmap(aentry->vm_addr); - aentry = aentry->next; - kfree(old); - } -} - -/* - * Internal xfs_buf_t object manipulation - */ - -STATIC void -_xfs_buf_initialize( - xfs_buf_t *bp, - xfs_buftarg_t *target, - xfs_off_t range_base, - size_t range_length, - xfs_buf_flags_t flags) -{ - /* - * We don't want certain flags to appear in b_flags. - */ - flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); - - memset(bp, 0, sizeof(xfs_buf_t)); - atomic_set(&bp->b_hold, 1); - init_MUTEX_LOCKED(&bp->b_iodonesema); - INIT_LIST_HEAD(&bp->b_list); - INIT_LIST_HEAD(&bp->b_hash_list); - init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ - XB_SET_OWNER(bp); - bp->b_target = target; - bp->b_file_offset = range_base; - /* - * Set buffer_length and count_desired to the same value initially. - * I/O routines should use count_desired, which will be the same in - * most cases but may be reset (e.g. XFS recovery). - */ - bp->b_buffer_length = bp->b_count_desired = range_length; - bp->b_flags = flags; - bp->b_bn = XFS_BUF_DADDR_NULL; - atomic_set(&bp->b_pin_count, 0); - init_waitqueue_head(&bp->b_waiters); - - XFS_STATS_INC(xb_create); - XB_TRACE(bp, "initialize", target); -} - -/* - * Allocate a page array capable of holding a specified number - * of pages, and point the page buf at it. - */ -STATIC int -_xfs_buf_get_pages( - xfs_buf_t *bp, - int page_count, - xfs_buf_flags_t flags) -{ - /* Make sure that we have a page list */ - if (bp->b_pages == NULL) { - bp->b_offset = xfs_buf_poff(bp->b_file_offset); - bp->b_page_count = page_count; - if (page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kmem_alloc(sizeof(struct page *) * - page_count, xb_to_km(flags)); - if (bp->b_pages == NULL) - return -ENOMEM; - } - memset(bp->b_pages, 0, sizeof(struct page *) * page_count); - } - return 0; -} - -/* - * Frees b_pages if it was allocated. - */ -STATIC void -_xfs_buf_free_pages( - xfs_buf_t *bp) -{ - if (bp->b_pages != bp->b_page_array) { - kmem_free(bp->b_pages, - bp->b_page_count * sizeof(struct page *)); - } -} - -/* - * Releases the specified buffer. - * - * The modification state of any associated pages is left unchanged. - * The buffer most not be on any hash - use xfs_buf_rele instead for - * hashed and refcounted buffers - */ -void -xfs_buf_free( - xfs_buf_t *bp) -{ - XB_TRACE(bp, "free", 0); - - ASSERT(list_empty(&bp->b_hash_list)); - - if (bp->b_flags & _XBF_PAGE_CACHE) { - uint i; - - if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - free_address(bp->b_addr - bp->b_offset); - - for (i = 0; i < bp->b_page_count; i++) - page_cache_release(bp->b_pages[i]); - _xfs_buf_free_pages(bp); - } else if (bp->b_flags & _XBF_KMEM_ALLOC) { - /* - * XXX(hch): bp->b_count_desired might be incorrect (see - * xfs_buf_associate_memory for details), but fortunately - * the Linux version of kmem_free ignores the len argument.. - */ - kmem_free(bp->b_addr, bp->b_count_desired); - _xfs_buf_free_pages(bp); - } - - xfs_buf_deallocate(bp); -} - -/* - * Finds all pages for buffer in question and builds it's page list. - */ -STATIC int -_xfs_buf_lookup_pages( - xfs_buf_t *bp, - uint flags) -{ - struct address_space *mapping = bp->b_target->bt_mapping; - size_t blocksize = bp->b_target->bt_bsize; - size_t size = bp->b_count_desired; - size_t nbytes, offset; - gfp_t gfp_mask = xb_to_gfp(flags); - unsigned short page_count, i; - pgoff_t first; - xfs_off_t end; - int error; - - end = bp->b_file_offset + bp->b_buffer_length; - page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); - - error = _xfs_buf_get_pages(bp, page_count, flags); - if (unlikely(error)) - return error; - bp->b_flags |= _XBF_PAGE_CACHE; - - offset = bp->b_offset; - first = bp->b_file_offset >> PAGE_CACHE_SHIFT; - - for (i = 0; i < bp->b_page_count; i++) { - struct page *page; - uint retries = 0; - - retry: - page = find_or_create_page(mapping, first + i, gfp_mask); - if (unlikely(page == NULL)) { - if (flags & XBF_READ_AHEAD) { - bp->b_page_count = i; - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - return -ENOMEM; - } - - /* - * This could deadlock. - * - * But until all the XFS lowlevel code is revamped to - * handle buffer allocation failures we can't do much. - */ - if (!(++retries % 100)) - printk(KERN_ERR - "XFS: possible memory allocation " - "deadlock in %s (mode:0x%x)\n", - __FUNCTION__, gfp_mask); - - XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); - blk_congestion_wait(WRITE, HZ/50); - goto retry; - } - - XFS_STATS_INC(xb_page_found); - - nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); - size -= nbytes; - - if (!PageUptodate(page)) { - page_count--; - if (blocksize >= PAGE_CACHE_SIZE) { - if (flags & XBF_READ) - bp->b_locked = 1; - } else if (!PagePrivate(page)) { - if (test_page_region(page, offset, nbytes)) - page_count++; - } - } - - bp->b_pages[i] = page; - offset = 0; - } - - if (!bp->b_locked) { - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - } - - if (page_count == bp->b_page_count) - bp->b_flags |= XBF_DONE; - - XB_TRACE(bp, "lookup_pages", (long)page_count); - return error; -} - -/* - * Map buffer into kernel address-space if nessecary. - */ -STATIC int -_xfs_buf_map_pages( - xfs_buf_t *bp, - uint flags) -{ - /* A single page buffer is always mappable */ - if (bp->b_page_count == 1) { - bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } else if (flags & XBF_MAPPED) { - if (as_list_len > 64) - purge_addresses(); - bp->b_addr = vmap(bp->b_pages, bp->b_page_count, - VM_MAP, PAGE_KERNEL); - if (unlikely(bp->b_addr == NULL)) - return -ENOMEM; - bp->b_addr += bp->b_offset; - bp->b_flags |= XBF_MAPPED; - } - - return 0; -} - -/* - * Finding and Reading Buffers - */ - -/* - * Look up, and creates if absent, a lockable buffer for - * a given range of an inode. The buffer is returned - * locked. If other overlapping buffers exist, they are - * released before the new buffer is created and locked, - * which may imply that this call will block until those buffers - * are unlocked. No I/O is implied by this call. - */ -xfs_buf_t * -_xfs_buf_find( - xfs_buftarg_t *btp, /* block device target */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ - xfs_buf_flags_t flags, - xfs_buf_t *new_bp) -{ - xfs_off_t range_base; - size_t range_length; - xfs_bufhash_t *hash; - xfs_buf_t *bp, *n; - - range_base = (ioff << BBSHIFT); - range_length = (isize << BBSHIFT); - - /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(range_length < (1 << btp->bt_sshift))); - ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); - - hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; - - spin_lock(&hash->bh_lock); - - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (bp->b_file_offset == range_base && - bp->b_buffer_length == range_length) { - /* - * If we look at something, bring it to the - * front of the list for next time. - */ - atomic_inc(&bp->b_hold); - list_move(&bp->b_hash_list, &hash->bh_list); - goto found; - } - } - - /* No match found */ - if (new_bp) { - _xfs_buf_initialize(new_bp, btp, range_base, - range_length, flags); - new_bp->b_hash = hash; - list_add(&new_bp->b_hash_list, &hash->bh_list); - } else { - XFS_STATS_INC(xb_miss_locked); - } - - spin_unlock(&hash->bh_lock); - return new_bp; - -found: - spin_unlock(&hash->bh_lock); - - /* Attempt to get the semaphore without sleeping, - * if this does not work then we need to drop the - * spinlock and do a hard attempt on the semaphore. - */ - if (down_trylock(&bp->b_sema)) { - if (!(flags & XBF_TRYLOCK)) { - /* wait for buffer ownership */ - XB_TRACE(bp, "get_lock", 0); - xfs_buf_lock(bp); - XFS_STATS_INC(xb_get_locked_waited); - } else { - /* We asked for a trylock and failed, no need - * to look at file offset and length here, we - * know that this buffer at least overlaps our - * buffer and is locked, therefore our buffer - * either does not exist, or is this buffer. - */ - xfs_buf_rele(bp); - XFS_STATS_INC(xb_busy_locked); - return NULL; - } - } else { - /* trylock worked */ - XB_SET_OWNER(bp); - } - - if (bp->b_flags & XBF_STALE) { - ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED; - } - XB_TRACE(bp, "got_lock", 0); - XFS_STATS_INC(xb_get_locked); - return bp; -} - -/* - * Assembles a buffer covering the specified range. - * Storage in memory for all portions of the buffer will be allocated, - * although backing storage may not be. - */ -xfs_buf_t * -xfs_buf_get_flags( - xfs_buftarg_t *target,/* target for buffer */ - xfs_off_t ioff, /* starting offset of range */ - size_t isize, /* length of range */ - xfs_buf_flags_t flags) -{ - xfs_buf_t *bp, *new_bp; - int error = 0, i; - - new_bp = xfs_buf_allocate(flags); - if (unlikely(!new_bp)) - return NULL; - - bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); - if (bp == new_bp) { - error = _xfs_buf_lookup_pages(bp, flags); - if (error) - goto no_buffer; - } else { - xfs_buf_deallocate(new_bp); - if (unlikely(bp == NULL)) - return NULL; - } - - for (i = 0; i < bp->b_page_count; i++) - mark_page_accessed(bp->b_pages[i]); - - if (!(bp->b_flags & XBF_MAPPED)) { - error = _xfs_buf_map_pages(bp, flags); - if (unlikely(error)) { - printk(KERN_WARNING "%s: failed to map pages\n", - __FUNCTION__); - goto no_buffer; - } - } - - XFS_STATS_INC(xb_get); - - /* - * Always fill in the block number now, the mapped cases can do - * their own overlay of this later. - */ - bp->b_bn = ioff; - bp->b_count_desired = bp->b_buffer_length; - - XB_TRACE(bp, "get", (unsigned long)flags); - return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; -} - -xfs_buf_t * -xfs_buf_read_flags( - xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, - xfs_buf_flags_t flags) -{ - xfs_buf_t *bp; - - flags |= XBF_READ; - - bp = xfs_buf_get_flags(target, ioff, isize, flags); - if (bp) { - if (!XFS_BUF_ISDONE(bp)) { - XB_TRACE(bp, "read", (unsigned long)flags); - XFS_STATS_INC(xb_get_read); - xfs_buf_iostart(bp, flags); - } else if (flags & XBF_ASYNC) { - XB_TRACE(bp, "read_async", (unsigned long)flags); - /* - * Read ahead call which is already satisfied, - * drop the buffer - */ - goto no_buffer; - } else { - XB_TRACE(bp, "read_done", (unsigned long)flags); - /* We do not want read in the flags */ - bp->b_flags &= ~XBF_READ; - } - } - - return bp; - - no_buffer: - if (flags & (XBF_LOCK | XBF_TRYLOCK)) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); - return NULL; -} - -/* - * If we are not low on memory then do the readahead in a deadlock - * safe manner. - */ -void -xfs_buf_readahead( - xfs_buftarg_t *target, - xfs_off_t ioff, - size_t isize, - xfs_buf_flags_t flags) -{ - struct backing_dev_info *bdi; - - bdi = target->bt_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) - return; - - flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); - xfs_buf_read_flags(target, ioff, isize, flags); -} - -xfs_buf_t * -xfs_buf_get_empty( - size_t len, - xfs_buftarg_t *target) -{ - xfs_buf_t *bp; - - bp = xfs_buf_allocate(0); - if (bp) - _xfs_buf_initialize(bp, target, 0, len, 0); - return bp; -} - -static inline struct page * -mem_to_page( - void *addr) -{ - if (((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END)) { - return virt_to_page(addr); - } else { - return vmalloc_to_page(addr); - } -} - -int -xfs_buf_associate_memory( - xfs_buf_t *bp, - void *mem, - size_t len) -{ - int rval; - int i = 0; - size_t ptr; - size_t end, end_cur; - off_t offset; - int page_count; - - page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; - offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK); - if (offset && (len > PAGE_CACHE_SIZE)) - page_count++; - - /* Free any previous set of page pointers */ - if (bp->b_pages) - _xfs_buf_free_pages(bp); - - bp->b_pages = NULL; - bp->b_addr = mem; - - rval = _xfs_buf_get_pages(bp, page_count, 0); - if (rval) - return rval; - - bp->b_offset = offset; - ptr = (size_t) mem & PAGE_CACHE_MASK; - end = PAGE_CACHE_ALIGN((size_t) mem + len); - end_cur = end; - /* set up first page */ - bp->b_pages[0] = mem_to_page(mem); - - ptr += PAGE_CACHE_SIZE; - bp->b_page_count = ++i; - while (ptr < end) { - bp->b_pages[i] = mem_to_page((void *)ptr); - bp->b_page_count = ++i; - ptr += PAGE_CACHE_SIZE; - } - bp->b_locked = 0; - - bp->b_count_desired = bp->b_buffer_length = len; - bp->b_flags |= XBF_MAPPED; - - return 0; -} - -xfs_buf_t * -xfs_buf_get_noaddr( - size_t len, - xfs_buftarg_t *target) -{ - size_t malloc_len = len; - xfs_buf_t *bp; - void *data; - int error; - - bp = xfs_buf_allocate(0); - if (unlikely(bp == NULL)) - goto fail; - _xfs_buf_initialize(bp, target, 0, len, 0); - - try_again: - data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL); - if (unlikely(data == NULL)) - goto fail_free_buf; - - /* check whether alignment matches.. */ - if ((__psunsigned_t)data != - ((__psunsigned_t)data & ~target->bt_smask)) { - /* .. else double the size and try again */ - kmem_free(data, malloc_len); - malloc_len <<= 1; - goto try_again; - } - - error = xfs_buf_associate_memory(bp, data, len); - if (error) - goto fail_free_mem; - bp->b_flags |= _XBF_KMEM_ALLOC; - - xfs_buf_unlock(bp); - - XB_TRACE(bp, "no_daddr", data); - return bp; - fail_free_mem: - kmem_free(data, malloc_len); - fail_free_buf: - xfs_buf_free(bp); - fail: - return NULL; -} - -/* - * Increment reference count on buffer, to hold the buffer concurrently - * with another thread which may release (free) the buffer asynchronously. - * Must hold the buffer already to call this function. - */ -void -xfs_buf_hold( - xfs_buf_t *bp) -{ - atomic_inc(&bp->b_hold); - XB_TRACE(bp, "hold", 0); -} - -/* - * Releases a hold on the specified buffer. If the - * the hold count is 1, calls xfs_buf_free. - */ -void -xfs_buf_rele( - xfs_buf_t *bp) -{ - xfs_bufhash_t *hash = bp->b_hash; - - XB_TRACE(bp, "rele", bp->b_relse); - - if (unlikely(!hash)) { - ASSERT(!bp->b_relse); - if (atomic_dec_and_test(&bp->b_hold)) - xfs_buf_free(bp); - return; - } - - if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { - if (bp->b_relse) { - atomic_inc(&bp->b_hold); - spin_unlock(&hash->bh_lock); - (*(bp->b_relse)) (bp); - } else if (bp->b_flags & XBF_FS_MANAGED) { - spin_unlock(&hash->bh_lock); - } else { - ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); - list_del_init(&bp->b_hash_list); - spin_unlock(&hash->bh_lock); - xfs_buf_free(bp); - } - } else { - /* - * Catch reference count leaks - */ - ASSERT(atomic_read(&bp->b_hold) >= 0); - } -} - - -/* - * Mutual exclusion on buffers. Locking model: - * - * Buffers associated with inodes for which buffer locking - * is not enabled are not protected by semaphores, and are - * assumed to be exclusively owned by the caller. There is a - * spinlock in the buffer, used by the caller when concurrent - * access is possible. - */ - -/* - * Locks a buffer object, if it is not already locked. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. - */ -int -xfs_buf_cond_lock( - xfs_buf_t *bp) -{ - int locked; - - locked = down_trylock(&bp->b_sema) == 0; - if (locked) { - XB_SET_OWNER(bp); - } - XB_TRACE(bp, "cond_lock", (long)locked); - return locked ? 0 : -EBUSY; -} - -#if defined(DEBUG) || defined(XFS_BLI_TRACE) -int -xfs_buf_lock_value( - xfs_buf_t *bp) -{ - return atomic_read(&bp->b_sema.count); -} -#endif - -/* - * Locks a buffer object. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. - */ -void -xfs_buf_lock( - xfs_buf_t *bp) -{ - XB_TRACE(bp, "lock", 0); - if (atomic_read(&bp->b_io_remaining)) - blk_run_address_space(bp->b_target->bt_mapping); - down(&bp->b_sema); - XB_SET_OWNER(bp); - XB_TRACE(bp, "locked", 0); -} - -/* - * Releases the lock on the buffer object. - * If the buffer is marked delwri but is not queued, do so before we - * unlock the buffer as we need to set flags correctly. We also need to - * take a reference for the delwri queue because the unlocker is going to - * drop their's and they don't know we just queued it. - */ -void -xfs_buf_unlock( - xfs_buf_t *bp) -{ - if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { - atomic_inc(&bp->b_hold); - bp->b_flags |= XBF_ASYNC; - xfs_buf_delwri_queue(bp, 0); - } - - XB_CLEAR_OWNER(bp); - up(&bp->b_sema); - XB_TRACE(bp, "unlock", 0); -} - - -/* - * Pinning Buffer Storage in Memory - * Ensure that no attempt to force a buffer to disk will succeed. - */ -void -xfs_buf_pin( - xfs_buf_t *bp) -{ - atomic_inc(&bp->b_pin_count); - XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); -} - -void -xfs_buf_unpin( - xfs_buf_t *bp) -{ - if (atomic_dec_and_test(&bp->b_pin_count)) - wake_up_all(&bp->b_waiters); - XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); -} - -int -xfs_buf_ispin( - xfs_buf_t *bp) -{ - return atomic_read(&bp->b_pin_count); -} - -STATIC void -xfs_buf_wait_unpin( - xfs_buf_t *bp) -{ - DECLARE_WAITQUEUE (wait, current); - - if (atomic_read(&bp->b_pin_count) == 0) - return; - - add_wait_queue(&bp->b_waiters, &wait); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&bp->b_pin_count) == 0) - break; - if (atomic_read(&bp->b_io_remaining)) - blk_run_address_space(bp->b_target->bt_mapping); - schedule(); - } - remove_wait_queue(&bp->b_waiters, &wait); - set_current_state(TASK_RUNNING); -} - -/* - * Buffer Utility Routines - */ - -STATIC void -xfs_buf_iodone_work( - void *v) -{ - xfs_buf_t *bp = (xfs_buf_t *)v; - - if (bp->b_iodone) - (*(bp->b_iodone))(bp); - else if (bp->b_flags & XBF_ASYNC) - xfs_buf_relse(bp); -} - -void -xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) -{ - bp->b_flags &= ~(XBF_READ | XBF_WRITE); - if (bp->b_error == 0) - bp->b_flags |= XBF_DONE; - - XB_TRACE(bp, "iodone", bp->b_iodone); - - if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { - if (schedule) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work, bp); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); - } else { - xfs_buf_iodone_work(bp); - } - } else { - up(&bp->b_iodonesema); - } -} - -void -xfs_buf_ioerror( - xfs_buf_t *bp, - int error) -{ - ASSERT(error >= 0 && error <= 0xffff); - bp->b_error = (unsigned short)error; - XB_TRACE(bp, "ioerror", (unsigned long)error); -} - -/* - * Initiate I/O on a buffer, based on the flags supplied. - * The b_iodone routine in the buffer supplied will only be called - * when all of the subsidiary I/O requests, if any, have been completed. - */ -int -xfs_buf_iostart( - xfs_buf_t *bp, - xfs_buf_flags_t flags) -{ - int status = 0; - - XB_TRACE(bp, "iostart", (unsigned long)flags); - - if (flags & XBF_DELWRI) { - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); - bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); - xfs_buf_delwri_queue(bp, 1); - return status; - } - - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ - XBF_READ_AHEAD | _XBF_RUN_QUEUES); - bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \ - XBF_READ_AHEAD | _XBF_RUN_QUEUES); - - BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL); - - /* For writes allow an alternate strategy routine to precede - * the actual I/O request (which may not be issued at all in - * a shutdown situation, for example). - */ - status = (flags & XBF_WRITE) ? - xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp); - - /* Wait for I/O if we are not an async request. - * Note: async I/O request completion will release the buffer, - * and that can already be done by this point. So using the - * buffer pointer from here on, after async I/O, is invalid. - */ - if (!status && !(flags & XBF_ASYNC)) - status = xfs_buf_iowait(bp); - - return status; -} - -STATIC __inline__ int -_xfs_buf_iolocked( - xfs_buf_t *bp) -{ - ASSERT(bp->b_flags & (XBF_READ | XBF_WRITE)); - if (bp->b_flags & XBF_READ) - return bp->b_locked; - return 0; -} - -STATIC __inline__ void -_xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) -{ - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - bp->b_locked = 0; - xfs_buf_ioend(bp, schedule); - } -} - -STATIC int -xfs_buf_bio_end_io( - struct bio *bio, - unsigned int bytes_done, - int error) -{ - xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - unsigned int blocksize = bp->b_target->bt_bsize; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - - if (bio->bi_size) - return 1; - - if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - bp->b_error = EIO; - - do { - struct page *page = bvec->bv_page; - - if (unlikely(bp->b_error)) { - if (bp->b_flags & XBF_READ) - ClearPageUptodate(page); - SetPageError(page); - } else if (blocksize >= PAGE_CACHE_SIZE) { - SetPageUptodate(page); - } else if (!PagePrivate(page) && - (bp->b_flags & _XBF_PAGE_CACHE)) { - set_page_region(page, bvec->bv_offset, bvec->bv_len); - } - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (_xfs_buf_iolocked(bp)) { - unlock_page(page); - } - } while (bvec >= bio->bi_io_vec); - - _xfs_buf_ioend(bp, 1); - bio_put(bio); - return 0; -} - -STATIC void -_xfs_buf_ioapply( - xfs_buf_t *bp) -{ - int i, rw, map_i, total_nr_pages, nr_pages; - struct bio *bio; - int offset = bp->b_offset; - int size = bp->b_count_desired; - sector_t sector = bp->b_bn; - unsigned int blocksize = bp->b_target->bt_bsize; - int locking = _xfs_buf_iolocked(bp); - - total_nr_pages = bp->b_page_count; - map_i = 0; - - if (bp->b_flags & _XBF_RUN_QUEUES) { - bp->b_flags &= ~_XBF_RUN_QUEUES; - rw = (bp->b_flags & XBF_READ) ? READ_SYNC : WRITE_SYNC; - } else { - rw = (bp->b_flags & XBF_READ) ? READ : WRITE; - } - - if (bp->b_flags & XBF_ORDERED) { - ASSERT(!(bp->b_flags & XBF_READ)); - rw = WRITE_BARRIER; - } - - /* Special code path for reading a sub page size buffer in -- - * we populate up the whole page, and hence the other metadata - * in the same page. This optimization is only valid when the - * filesystem block size is not smaller than the page size. - */ - if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && - (bp->b_flags & XBF_READ) && locking && - (blocksize >= PAGE_CACHE_SIZE)) { - bio = bio_alloc(GFP_NOIO, 1); - - bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector - (offset >> BBSHIFT); - bio->bi_end_io = xfs_buf_bio_end_io; - bio->bi_private = bp; - - bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); - size = 0; - - atomic_inc(&bp->b_io_remaining); - - goto submit_io; - } - - /* Lock down the pages which we need to for the request */ - if (locking && (bp->b_flags & XBF_WRITE) && (bp->b_locked == 0)) { - for (i = 0; size; i++) { - int nbytes = PAGE_CACHE_SIZE - offset; - struct page *page = bp->b_pages[i]; - - if (nbytes > size) - nbytes = size; - - lock_page(page); - - size -= nbytes; - offset = 0; - } - offset = bp->b_offset; - size = bp->b_count_desired; - } - -next_chunk: - atomic_inc(&bp->b_io_remaining); - nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); - if (nr_pages > total_nr_pages) - nr_pages = total_nr_pages; - - bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector; - bio->bi_end_io = xfs_buf_bio_end_io; - bio->bi_private = bp; - - for (; size && nr_pages; nr_pages--, map_i++) { - int rbytes, nbytes = PAGE_CACHE_SIZE - offset; - - if (nbytes > size) - nbytes = size; - - rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); - if (rbytes < nbytes) - break; - - offset = 0; - sector += nbytes >> BBSHIFT; - size -= nbytes; - total_nr_pages--; - } - -submit_io: - if (likely(bio->bi_size)) { - submit_bio(rw, bio); - if (size) - goto next_chunk; - } else { - bio_put(bio); - xfs_buf_ioerror(bp, EIO); - } -} - -int -xfs_buf_iorequest( - xfs_buf_t *bp) -{ - XB_TRACE(bp, "iorequest", 0); - - if (bp->b_flags & XBF_DELWRI) { - xfs_buf_delwri_queue(bp, 1); - return 0; - } - - if (bp->b_flags & XBF_WRITE) { - xfs_buf_wait_unpin(bp); - } - - xfs_buf_hold(bp); - - /* Set the count to 1 initially, this will stop an I/O - * completion callout which happens before we have started - * all the I/O from calling xfs_buf_ioend too early. - */ - atomic_set(&bp->b_io_remaining, 1); - _xfs_buf_ioapply(bp); - _xfs_buf_ioend(bp, 0); - - xfs_buf_rele(bp); - return 0; -} - -/* - * Waits for I/O to complete on the buffer supplied. - * It returns immediately if no I/O is pending. - * It returns the I/O error code, if any, or 0 if there was no error. - */ -int -xfs_buf_iowait( - xfs_buf_t *bp) -{ - XB_TRACE(bp, "iowait", 0); - if (atomic_read(&bp->b_io_remaining)) - blk_run_address_space(bp->b_target->bt_mapping); - down(&bp->b_iodonesema); - XB_TRACE(bp, "iowaited", (long)bp->b_error); - return bp->b_error; -} - -xfs_caddr_t -xfs_buf_offset( - xfs_buf_t *bp, - size_t offset) -{ - struct page *page; - - if (bp->b_flags & XBF_MAPPED) - return XFS_BUF_PTR(bp) + offset; - - offset += bp->b_offset; - page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); -} - -/* - * Move data into or out of a buffer. - */ -void -xfs_buf_iomove( - xfs_buf_t *bp, /* buffer to process */ - size_t boff, /* starting buffer offset */ - size_t bsize, /* length to copy */ - caddr_t data, /* data address */ - xfs_buf_rw_t mode) /* read/write/zero flag */ -{ - size_t bend, cpoff, csize; - struct page *page; - - bend = boff + bsize; - while (boff < bend) { - page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; - cpoff = xfs_buf_poff(boff + bp->b_offset); - csize = min_t(size_t, - PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); - - ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); - - switch (mode) { - case XBRW_ZERO: - memset(page_address(page) + cpoff, 0, csize); - break; - case XBRW_READ: - memcpy(data, page_address(page) + cpoff, csize); - break; - case XBRW_WRITE: - memcpy(page_address(page) + cpoff, data, csize); - } - - boff += csize; - data += csize; - } -} - -/* - * Handling of buffer targets (buftargs). - */ - -/* - * Wait for any bufs with callbacks that have been submitted but - * have not yet returned... walk the hash list for the target. - */ -void -xfs_wait_buftarg( - xfs_buftarg_t *btp) -{ - xfs_buf_t *bp, *n; - xfs_bufhash_t *hash; - uint i; - - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - hash = &btp->bt_hash[i]; -again: - spin_lock(&hash->bh_lock); - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (!(bp->b_flags & XBF_FS_MANAGED)) { - spin_unlock(&hash->bh_lock); - /* - * Catch superblock reference count leaks - * immediately - */ - BUG_ON(bp->b_bn == 0); - delay(100); - goto again; - } - } - spin_unlock(&hash->bh_lock); - } -} - -/* - * Allocate buffer hash table for a given target. - * For devices containing metadata (i.e. not the log/realtime devices) - * we need to allocate a much larger hash table. - */ -STATIC void -xfs_alloc_bufhash( - xfs_buftarg_t *btp, - int external) -{ - unsigned int i; - - btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ - btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; - btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t), KM_SLEEP); - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - spin_lock_init(&btp->bt_hash[i].bh_lock); - INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); - } -} - -STATIC void -xfs_free_bufhash( - xfs_buftarg_t *btp) -{ - kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t)); - btp->bt_hash = NULL; -} - -/* - * buftarg list for delwrite queue processing - */ -STATIC LIST_HEAD(xfs_buftarg_list); -STATIC DEFINE_SPINLOCK(xfs_buftarg_lock); - -STATIC void -xfs_register_buftarg( - xfs_buftarg_t *btp) -{ - spin_lock(&xfs_buftarg_lock); - list_add(&btp->bt_list, &xfs_buftarg_list); - spin_unlock(&xfs_buftarg_lock); -} - -STATIC void -xfs_unregister_buftarg( - xfs_buftarg_t *btp) -{ - spin_lock(&xfs_buftarg_lock); - list_del(&btp->bt_list); - spin_unlock(&xfs_buftarg_lock); -} - -void -xfs_free_buftarg( - xfs_buftarg_t *btp, - int external) -{ - xfs_flush_buftarg(btp, 1); - if (external) - xfs_blkdev_put(btp->bt_bdev); - xfs_free_bufhash(btp); - iput(btp->bt_mapping->host); - - /* Unregister the buftarg first so that we don't get a - * wakeup finding a non-existent task - */ - xfs_unregister_buftarg(btp); - kthread_stop(btp->bt_task); - - kmem_free(btp, sizeof(*btp)); -} - -STATIC int -xfs_setsize_buftarg_flags( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize, - int verbose) -{ - btp->bt_bsize = blocksize; - btp->bt_sshift = ffs(sectorsize) - 1; - btp->bt_smask = sectorsize - 1; - - if (set_blocksize(btp->bt_bdev, sectorsize)) { - printk(KERN_WARNING - "XFS: Cannot set_blocksize to %u on device %s\n", - sectorsize, XFS_BUFTARG_NAME(btp)); - return EINVAL; - } - - if (verbose && - (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { - printk(KERN_WARNING - "XFS: %u byte sectors in use on device %s. " - "This is suboptimal; %u or greater is ideal.\n", - sectorsize, XFS_BUFTARG_NAME(btp), - (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); - } - - return 0; -} - -/* - * When allocating the initial buffer target we have not yet - * read in the superblock, so don't know what sized sectors - * are being used is at this early stage. Play safe. - */ -STATIC int -xfs_setsize_buftarg_early( - xfs_buftarg_t *btp, - struct block_device *bdev) -{ - return xfs_setsize_buftarg_flags(btp, - PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); -} - -int -xfs_setsize_buftarg( - xfs_buftarg_t *btp, - unsigned int blocksize, - unsigned int sectorsize) -{ - return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); -} - -STATIC int -xfs_mapping_buftarg( - xfs_buftarg_t *btp, - struct block_device *bdev) -{ - struct backing_dev_info *bdi; - struct inode *inode; - struct address_space *mapping; - static const struct address_space_operations mapping_aops = { - .sync_page = block_sync_page, - .migratepage = fail_migrate_page, - }; - - inode = new_inode(bdev->bd_inode->i_sb); - if (!inode) { - printk(KERN_WARNING - "XFS: Cannot allocate mapping inode for device %s\n", - XFS_BUFTARG_NAME(btp)); - return ENOMEM; - } - inode->i_mode = S_IFBLK; - inode->i_bdev = bdev; - inode->i_rdev = bdev->bd_dev; - bdi = blk_get_backing_dev_info(bdev); - if (!bdi) - bdi = &default_backing_dev_info; - mapping = &inode->i_data; - mapping->a_ops = &mapping_aops; - mapping->backing_dev_info = bdi; - mapping_set_gfp_mask(mapping, GFP_NOFS); - btp->bt_mapping = mapping; - return 0; -} - -STATIC int -xfs_alloc_delwrite_queue( - xfs_buftarg_t *btp) -{ - int error = 0; - - INIT_LIST_HEAD(&btp->bt_list); - INIT_LIST_HEAD(&btp->bt_delwrite_queue); - spinlock_init(&btp->bt_delwrite_lock, "delwri_lock"); - btp->bt_flags = 0; - btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); - if (IS_ERR(btp->bt_task)) { - error = PTR_ERR(btp->bt_task); - goto out_error; - } - xfs_register_buftarg(btp); -out_error: - return error; -} - -xfs_buftarg_t * -xfs_alloc_buftarg( - struct block_device *bdev, - int external) -{ - xfs_buftarg_t *btp; - - btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); - - btp->bt_dev = bdev->bd_dev; - btp->bt_bdev = bdev; - if (xfs_setsize_buftarg_early(btp, bdev)) - goto error; - if (xfs_mapping_buftarg(btp, bdev)) - goto error; - if (xfs_alloc_delwrite_queue(btp)) - goto error; - xfs_alloc_bufhash(btp, external); - return btp; - -error: - kmem_free(btp, sizeof(*btp)); - return NULL; -} - - -/* - * Delayed write buffer handling - */ -STATIC void -xfs_buf_delwri_queue( - xfs_buf_t *bp, - int unlock) -{ - struct list_head *dwq = &bp->b_target->bt_delwrite_queue; - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; - - XB_TRACE(bp, "delwri_q", (long)unlock); - ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); - - spin_lock(dwlk); - /* If already in the queue, dequeue and place at tail */ - if (!list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - if (unlock) - atomic_dec(&bp->b_hold); - list_del(&bp->b_list); - } - - bp->b_flags |= _XBF_DELWRI_Q; - list_add_tail(&bp->b_list, dwq); - bp->b_queuetime = jiffies; - spin_unlock(dwlk); - - if (unlock) - xfs_buf_unlock(bp); -} - -void -xfs_buf_delwri_dequeue( - xfs_buf_t *bp) -{ - spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; - int dequeued = 0; - - spin_lock(dwlk); - if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { - ASSERT(bp->b_flags & _XBF_DELWRI_Q); - list_del_init(&bp->b_list); - dequeued = 1; - } - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - spin_unlock(dwlk); - - if (dequeued) - xfs_buf_rele(bp); - - XB_TRACE(bp, "delwri_dq", (long)dequeued); -} - -STATIC void -xfs_buf_runall_queues( - struct workqueue_struct *queue) -{ - flush_workqueue(queue); -} - -STATIC int -xfsbufd_wakeup( - int priority, - gfp_t mask) -{ - xfs_buftarg_t *btp; - - spin_lock(&xfs_buftarg_lock); - list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { - if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) - continue; - set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); - wake_up_process(btp->bt_task); - } - spin_unlock(&xfs_buftarg_lock); - return 0; -} - -STATIC int -xfsbufd( - void *data) -{ - struct list_head tmp; - unsigned long age; - xfs_buftarg_t *target = (xfs_buftarg_t *)data; - xfs_buf_t *bp, *n; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; - - current->flags |= PF_MEMALLOC; - - INIT_LIST_HEAD(&tmp); - do { - if (unlikely(freezing(current))) { - set_bit(XBT_FORCE_SLEEP, &target->bt_flags); - refrigerator(); - } else { - clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); - } - - schedule_timeout_interruptible( - xfs_buf_timer_centisecs * msecs_to_jiffies(10)); - - age = xfs_buf_age_centisecs * msecs_to_jiffies(10); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { - XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); - ASSERT(bp->b_flags & XBF_DELWRI); - - if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { - if (!test_bit(XBT_FORCE_FLUSH, - &target->bt_flags) && - time_before(jiffies, - bp->b_queuetime + age)) { - xfs_buf_unlock(bp); - break; - } - - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - list_move(&bp->b_list, &tmp); - } - } - spin_unlock(dwlk); - - while (!list_empty(&tmp)) { - bp = list_entry(tmp.next, xfs_buf_t, b_list); - ASSERT(target == bp->b_target); - - list_del_init(&bp->b_list); - xfs_buf_iostrategy(bp); - - blk_run_address_space(target->bt_mapping); - } - - if (as_list_len > 0) - purge_addresses(); - - clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); - } while (!kthread_should_stop()); - - return 0; -} - -/* - * Go through all incore buffers, and release buffers if they belong to - * the given device. This is used in filesystem error handling to - * preserve the consistency of its metadata. - */ -int -xfs_flush_buftarg( - xfs_buftarg_t *target, - int wait) -{ - struct list_head tmp; - xfs_buf_t *bp, *n; - int pincount = 0; - struct list_head *dwq = &target->bt_delwrite_queue; - spinlock_t *dwlk = &target->bt_delwrite_lock; - - xfs_buf_runall_queues(xfsdatad_workqueue); - xfs_buf_runall_queues(xfslogd_workqueue); - - INIT_LIST_HEAD(&tmp); - spin_lock(dwlk); - list_for_each_entry_safe(bp, n, dwq, b_list) { - ASSERT(bp->b_target == target); - ASSERT(bp->b_flags & (XBF_DELWRI | _XBF_DELWRI_Q)); - XB_TRACE(bp, "walkq2", (long)xfs_buf_ispin(bp)); - if (xfs_buf_ispin(bp)) { - pincount++; - continue; - } - - list_move(&bp->b_list, &tmp); - } - spin_unlock(dwlk); - - /* - * Dropped the delayed write list lock, now walk the temporary list - */ - list_for_each_entry_safe(bp, n, &tmp, b_list) { - xfs_buf_lock(bp); - bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); - bp->b_flags |= XBF_WRITE; - if (wait) - bp->b_flags &= ~XBF_ASYNC; - else - list_del_init(&bp->b_list); - - xfs_buf_iostrategy(bp); - } - - /* - * Remaining list items must be flushed before returning - */ - while (!list_empty(&tmp)) { - bp = list_entry(tmp.next, xfs_buf_t, b_list); - - list_del_init(&bp->b_list); - xfs_iowait(bp); - xfs_buf_relse(bp); - } - - if (wait) - blk_run_address_space(target->bt_mapping); - - return pincount; -} - -int __init -xfs_buf_init(void) -{ -#ifdef XFS_BUF_TRACE - xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP); -#endif - - xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", - KM_ZONE_HWALIGN, NULL); - if (!xfs_buf_zone) - goto out_free_trace_buf; - - xfslogd_workqueue = create_workqueue("xfslogd"); - if (!xfslogd_workqueue) - goto out_free_buf_zone; - - xfsdatad_workqueue = create_workqueue("xfsdatad"); - if (!xfsdatad_workqueue) - goto out_destroy_xfslogd_workqueue; - - xfs_buf_shake = kmem_shake_register(xfsbufd_wakeup); - if (!xfs_buf_shake) - goto out_destroy_xfsdatad_workqueue; - - return 0; - - out_destroy_xfsdatad_workqueue: - destroy_workqueue(xfsdatad_workqueue); - out_destroy_xfslogd_workqueue: - destroy_workqueue(xfslogd_workqueue); - out_free_buf_zone: - kmem_zone_destroy(xfs_buf_zone); - out_free_trace_buf: -#ifdef XFS_BUF_TRACE - ktrace_free(xfs_buf_trace_buf); -#endif - return -ENOMEM; -} - -void -xfs_buf_terminate(void) -{ - kmem_shake_deregister(xfs_buf_shake); - destroy_workqueue(xfsdatad_workqueue); - destroy_workqueue(xfslogd_workqueue); - kmem_zone_destroy(xfs_buf_zone); -#ifdef XFS_BUF_TRACE - ktrace_free(xfs_buf_trace_buf); -#endif -} diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h deleted file mode 100644 index 7858703ed84..00000000000 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ /dev/null @@ -1,428 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_BUF_H__ -#define __XFS_BUF_H__ - -#include <linux/list.h> -#include <linux/types.h> -#include <linux/spinlock.h> -#include <asm/system.h> -#include <linux/mm.h> -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/uio.h> - -/* - * Base types - */ - -#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) - -#define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) -#define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -#define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) -#define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) - -typedef enum { - XBRW_READ = 1, /* transfer into target memory */ - XBRW_WRITE = 2, /* transfer from target memory */ - XBRW_ZERO = 3, /* Zero target memory */ -} xfs_buf_rw_t; - -typedef enum { - XBF_READ = (1 << 0), /* buffer intended for reading from device */ - XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ - XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ - XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ - XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ - XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ - XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ - XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ - XBF_ORDERED = (1 << 11), /* use ordered writes */ - XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ - - /* flags used only as arguments to access routines */ - XBF_LOCK = (1 << 14), /* lock requested */ - XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ - XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ - - /* flags used only internally */ - _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ - _XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc() */ - _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ - _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ -} xfs_buf_flags_t; - -typedef enum { - XBT_FORCE_SLEEP = (0 << 1), - XBT_FORCE_FLUSH = (1 << 1), -} xfs_buftarg_flags_t; - -typedef struct xfs_bufhash { - struct list_head bh_list; - spinlock_t bh_lock; -} xfs_bufhash_t; - -typedef struct xfs_buftarg { - dev_t bt_dev; - struct block_device *bt_bdev; - struct address_space *bt_mapping; - unsigned int bt_bsize; - unsigned int bt_sshift; - size_t bt_smask; - - /* per device buffer hash table */ - uint bt_hashmask; - uint bt_hashshift; - xfs_bufhash_t *bt_hash; - - /* per device delwri queue */ - struct task_struct *bt_task; - struct list_head bt_list; - struct list_head bt_delwrite_queue; - spinlock_t bt_delwrite_lock; - unsigned long bt_flags; -} xfs_buftarg_t; - -/* - * xfs_buf_t: Buffer structure for pagecache-based buffers - * - * This buffer structure is used by the pagecache buffer management routines - * to refer to an assembly of pages forming a logical buffer. - * - * The buffer structure is used on a temporary basis only, and discarded when - * released. The real data storage is recorded in the pagecache. Buffers are - * hashed to the block device on which the file system resides. - */ - -struct xfs_buf; -typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); -typedef void (*xfs_buf_relse_t)(struct xfs_buf *); -typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); - -#define XB_PAGES 2 - -typedef struct xfs_buf { - struct semaphore b_sema; /* semaphore for lockables */ - unsigned long b_queuetime; /* time buffer was queued */ - atomic_t b_pin_count; /* pin count */ - wait_queue_head_t b_waiters; /* unpin waiters */ - struct list_head b_list; - xfs_buf_flags_t b_flags; /* status flags */ - struct list_head b_hash_list; /* hash table list */ - xfs_bufhash_t *b_hash; /* hash table list start */ - xfs_buftarg_t *b_target; /* buffer target (device) */ - atomic_t b_hold; /* reference count */ - xfs_daddr_t b_bn; /* block number for I/O */ - xfs_off_t b_file_offset; /* offset in file */ - size_t b_buffer_length;/* size of buffer in bytes */ - size_t b_count_desired;/* desired transfer size */ - void *b_addr; /* virtual address of buffer */ - struct work_struct b_iodone_work; - atomic_t b_io_remaining; /* #outstanding I/O requests */ - xfs_buf_iodone_t b_iodone; /* I/O completion function */ - xfs_buf_relse_t b_relse; /* releasing function */ - xfs_buf_bdstrat_t b_strat; /* pre-write function */ - struct semaphore b_iodonesema; /* Semaphore for I/O waiters */ - void *b_fspriv; - void *b_fspriv2; - void *b_fspriv3; - unsigned short b_error; /* error code on I/O */ - unsigned short b_locked; /* page array is locked */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset in first page */ - struct page **b_pages; /* array of page pointers */ - struct page *b_page_array[XB_PAGES]; /* inline pages */ -#ifdef XFS_BUF_LOCK_TRACKING - int b_last_holder; -#endif -} xfs_buf_t; - - -/* Finding and Reading Buffers */ -extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t, xfs_buf_t *); -#define xfs_incore(buftarg,blkno,len,lockit) \ - _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) - -extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); -#define xfs_buf_get(target, blkno, len, flags) \ - xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) - -extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); -#define xfs_buf_read(target, blkno, len, flags) \ - xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) - -extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); -extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); -extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); -extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); - -/* Releasing Buffers */ -extern void xfs_buf_free(xfs_buf_t *); -extern void xfs_buf_rele(xfs_buf_t *); - -/* Locking and Unlocking Buffers */ -extern int xfs_buf_cond_lock(xfs_buf_t *); -extern int xfs_buf_lock_value(xfs_buf_t *); -extern void xfs_buf_lock(xfs_buf_t *); -extern void xfs_buf_unlock(xfs_buf_t *); - -/* Buffer Read and Write Routines */ -extern void xfs_buf_ioend(xfs_buf_t *, int); -extern void xfs_buf_ioerror(xfs_buf_t *, int); -extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t); -extern int xfs_buf_iorequest(xfs_buf_t *); -extern int xfs_buf_iowait(xfs_buf_t *); -extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, - xfs_buf_rw_t); - -static inline int xfs_buf_iostrategy(xfs_buf_t *bp) -{ - return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); -} - -static inline int xfs_buf_geterror(xfs_buf_t *bp) -{ - return bp ? bp->b_error : ENOMEM; -} - -/* Buffer Utility Routines */ -extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); - -/* Pinning Buffer Storage in Memory */ -extern void xfs_buf_pin(xfs_buf_t *); -extern void xfs_buf_unpin(xfs_buf_t *); -extern int xfs_buf_ispin(xfs_buf_t *); - -/* Delayed Write Buffer Routines */ -extern void xfs_buf_delwri_dequeue(xfs_buf_t *); - -/* Buffer Daemon Setup Routines */ -extern int xfs_buf_init(void); -extern void xfs_buf_terminate(void); - -#ifdef XFS_BUF_TRACE -extern ktrace_t *xfs_buf_trace_buf; -extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); -#else -#define xfs_buf_trace(bp,id,ptr,ra) do { } while (0) -#endif - -#define xfs_buf_target_name(target) \ - ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) - - -#define XFS_B_ASYNC XBF_ASYNC -#define XFS_B_DELWRI XBF_DELWRI -#define XFS_B_READ XBF_READ -#define XFS_B_WRITE XBF_WRITE -#define XFS_B_STALE XBF_STALE - -#define XFS_BUF_TRYLOCK XBF_TRYLOCK -#define XFS_INCORE_TRYLOCK XBF_TRYLOCK -#define XFS_BUF_LOCK XBF_LOCK -#define XFS_BUF_MAPPED XBF_MAPPED - -#define BUF_BUSY XBF_DONT_BLOCK - -#define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) -#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ - ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) - -#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) -#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) -#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) -#define XFS_BUF_SUPER_STALE(bp) do { \ - XFS_BUF_STALE(bp); \ - xfs_buf_delwri_dequeue(bp); \ - XFS_BUF_DONE(bp); \ - } while (0) - -#define XFS_BUF_MANAGE XBF_FS_MANAGED -#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) - -#define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) -#define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) -#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) - -#define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) -#define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) -#define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) - -#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) -#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) -#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) - -#define XFS_BUF_BUSY(bp) do { } while (0) -#define XFS_BUF_UNBUSY(bp) do { } while (0) -#define XFS_BUF_ISBUSY(bp) (1) - -#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) -#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) -#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) - -#define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) -#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) -#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) - -#define XFS_BUF_SHUT(bp) do { } while (0) -#define XFS_BUF_UNSHUT(bp) do { } while (0) -#define XFS_BUF_ISSHUT(bp) (0) - -#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) -#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) -#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) -#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) - -#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) -#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) -#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) - -#define XFS_BUF_ISUNINITIAL(bp) (0) -#define XFS_BUF_UNUNINITIAL(bp) (0) - -#define XFS_BUF_BP_ISMAPPED(bp) (1) - -#define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) -#define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) -#define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) -#define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) -#define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) - -#define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) -#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) -#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) -#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) -#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3) -#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val)) -#define XFS_BUF_SET_START(bp) do { } while (0) -#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) - -#define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) -#define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) -#define XFS_BUF_ADDR(bp) ((bp)->b_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) -#define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) -#define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) -#define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) -#define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) -#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) -#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) - -#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) -#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) -#define XFS_BUF_SET_REF(bp, ref) do { } while (0) - -#define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) - -#define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) -#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) -#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) -#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) -#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema); - -#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) -#define XFS_BUF_TARGET(bp) ((bp)->b_target) -#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) - -static inline int xfs_bawrite(void *mp, xfs_buf_t *bp) -{ - bp->b_fspriv3 = mp; - bp->b_strat = xfs_bdstrat_cb; - xfs_buf_delwri_dequeue(bp); - return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); -} - -static inline void xfs_buf_relse(xfs_buf_t *bp) -{ - if (!bp->b_relse) - xfs_buf_unlock(bp); - xfs_buf_rele(bp); -} - -#define xfs_bpin(bp) xfs_buf_pin(bp) -#define xfs_bunpin(bp) xfs_buf_unpin(bp) - -#define xfs_buftrace(id, bp) \ - xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) - -#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) - -#define xfs_biomove(bp, off, len, data, rw) \ - xfs_buf_iomove((bp), (off), (len), (data), \ - ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) - -#define xfs_biozero(bp, off, len) \ - xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) - - -static inline int XFS_bwrite(xfs_buf_t *bp) -{ - int iowait = (bp->b_flags & XBF_ASYNC) == 0; - int error = 0; - - if (!iowait) - bp->b_flags |= _XBF_RUN_QUEUES; - - xfs_buf_delwri_dequeue(bp); - xfs_buf_iostrategy(bp); - if (iowait) { - error = xfs_buf_iowait(bp); - xfs_buf_relse(bp); - } - return error; -} - -#define XFS_bdwrite(bp) xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC) - -static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) -{ - bp->b_strat = xfs_bdstrat_cb; - bp->b_fspriv3 = mp; - return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); -} - -#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) - -#define xfs_iowait(bp) xfs_buf_iowait(bp) - -#define xfs_baread(target, rablkno, ralen) \ - xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) - - -/* - * Handling of buftargs. - */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *, int); -extern void xfs_wait_buftarg(xfs_buftarg_t *); -extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); -extern int xfs_flush_buftarg(xfs_buftarg_t *, int); - -#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) -#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) - -#define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) -#define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) - -#endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h deleted file mode 100644 index e7f3da61c6c..00000000000 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CRED_H__ -#define __XFS_CRED_H__ - -#include <linux/capability.h> - -/* - * Credentials - */ -typedef struct cred { - /* EMPTY */ -} cred_t; - -extern struct cred *sys_cred; - -/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ -static __inline int capable_cred(cred_t *cr, int cid) -{ - return (cr == sys_cred) ? 1 : capable(cid); -} - -#endif /* __XFS_CRED_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c deleted file mode 100644 index 5fb75d9151f..00000000000 --- a/fs/xfs/linux-2.6/xfs_export.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2004-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_types.h" -#include "xfs_dmapi.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_mount.h" -#include "xfs_export.h" - -STATIC struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, }; - -/* - * XFS encodes and decodes the fileid portion of NFS filehandles - * itself instead of letting the generic NFS code do it. This - * allows filesystems with 64 bit inode numbers to be exported. - * - * Note that a side effect is that xfs_vget() won't be passed a - * zero inode/generation pair under normal circumstances. As - * however a malicious client could send us such data, the check - * remains in that code. - */ - -STATIC struct dentry * -xfs_fs_decode_fh( - struct super_block *sb, - __u32 *fh, - int fh_len, - int fileid_type, - int (*acceptable)( - void *context, - struct dentry *de), - void *context) -{ - xfs_fid2_t ifid; - xfs_fid2_t pfid; - void *parent = NULL; - int is64 = 0; - __u32 *p = fh; - -#if XFS_BIG_INUMS - is64 = (fileid_type & XFS_FILEID_TYPE_64FLAG); - fileid_type &= ~XFS_FILEID_TYPE_64FLAG; -#endif - - /* - * Note that we only accept fileids which are long enough - * rather than allow the parent generation number to default - * to zero. XFS considers zero a valid generation number not - * an invalid/wildcard value. There's little point printk'ing - * a warning here as we don't have the client information - * which would make such a warning useful. - */ - if (fileid_type > 2 || - fh_len < xfs_fileid_length((fileid_type == 2), is64)) - return NULL; - - p = xfs_fileid_decode_fid2(p, &ifid, is64); - - if (fileid_type == 2) { - p = xfs_fileid_decode_fid2(p, &pfid, is64); - parent = &pfid; - } - - fh = (__u32 *)&ifid; - return sb->s_export_op->find_exported_dentry(sb, fh, parent, acceptable, context); -} - - -STATIC int -xfs_fs_encode_fh( - struct dentry *dentry, - __u32 *fh, - int *max_len, - int connectable) -{ - struct inode *inode = dentry->d_inode; - int type = 1; - __u32 *p = fh; - int len; - int is64 = 0; -#if XFS_BIG_INUMS - bhv_vfs_t *vfs = vfs_from_sb(inode->i_sb); - - if (!(vfs->vfs_flag & VFS_32BITINODES)) { - /* filesystem may contain 64bit inode numbers */ - is64 = XFS_FILEID_TYPE_64FLAG; - } -#endif - - /* Directories don't need their parent encoded, they have ".." */ - if (S_ISDIR(inode->i_mode)) - connectable = 0; - - /* - * Only encode if there is enough space given. In practice - * this means we can't export a filesystem with 64bit inodes - * over NFSv2 with the subtree_check export option; the other - * seven combinations work. The real answer is "don't use v2". - */ - len = xfs_fileid_length(connectable, is64); - if (*max_len < len) - return 255; - *max_len = len; - - p = xfs_fileid_encode_inode(p, inode, is64); - if (connectable) { - spin_lock(&dentry->d_lock); - p = xfs_fileid_encode_inode(p, dentry->d_parent->d_inode, is64); - spin_unlock(&dentry->d_lock); - type = 2; - } - BUG_ON((p - fh) != len); - return type | is64; -} - -STATIC struct dentry * -xfs_fs_get_dentry( - struct super_block *sb, - void *data) -{ - bhv_vnode_t *vp; - struct inode *inode; - struct dentry *result; - bhv_vfs_t *vfsp = vfs_from_sb(sb); - int error; - - error = bhv_vfs_vget(vfsp, &vp, (fid_t *)data); - if (error || vp == NULL) - return ERR_PTR(-ESTALE) ; - - inode = vn_to_inode(vp); - result = d_alloc_anon(inode); - if (!result) { - iput(inode); - return ERR_PTR(-ENOMEM); - } - return result; -} - -STATIC struct dentry * -xfs_fs_get_parent( - struct dentry *child) -{ - int error; - bhv_vnode_t *vp, *cvp; - struct dentry *parent; - - cvp = NULL; - vp = vn_from_inode(child->d_inode); - error = bhv_vop_lookup(vp, &dotdot, &cvp, 0, NULL, NULL); - if (unlikely(error)) - return ERR_PTR(-error); - - parent = d_alloc_anon(vn_to_inode(cvp)); - if (unlikely(!parent)) { - VN_RELE(cvp); - return ERR_PTR(-ENOMEM); - } - return parent; -} - -struct export_operations xfs_export_operations = { - .decode_fh = xfs_fs_decode_fh, - .encode_fh = xfs_fs_encode_fh, - .get_parent = xfs_fs_get_parent, - .get_dentry = xfs_fs_get_dentry, -}; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c deleted file mode 100644 index 41cfcba7ce4..00000000000 --- a/fs/xfs/linux-2.6/xfs_file.c +++ /dev/null @@ -1,610 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_trans.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_ioctl32.h" - -#include <linux/dcache.h> -#include <linux/smp_lock.h> - -static struct vm_operations_struct xfs_file_vm_ops; -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops; -#endif - -STATIC inline ssize_t -__xfs_file_read( - struct kiocb *iocb, - char __user *buf, - int ioflags, - size_t count, - loff_t pos) -{ - struct iovec iov = {buf, count}; - struct file *file = iocb->ki_filp; - bhv_vnode_t *vp = vn_from_inode(file->f_dentry->d_inode); - - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - return bhv_vop_read(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL); -} - -STATIC ssize_t -xfs_file_aio_read( - struct kiocb *iocb, - char __user *buf, - size_t count, - loff_t pos) -{ - return __xfs_file_read(iocb, buf, IO_ISAIO, count, pos); -} - -STATIC ssize_t -xfs_file_aio_read_invis( - struct kiocb *iocb, - char __user *buf, - size_t count, - loff_t pos) -{ - return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); -} - -STATIC inline ssize_t -__xfs_file_write( - struct kiocb *iocb, - const char __user *buf, - int ioflags, - size_t count, - loff_t pos) -{ - struct iovec iov = {(void __user *)buf, count}; - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - - BUG_ON(iocb->ki_pos != pos); - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - return bhv_vop_write(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL); -} - -STATIC ssize_t -xfs_file_aio_write( - struct kiocb *iocb, - const char __user *buf, - size_t count, - loff_t pos) -{ - return __xfs_file_write(iocb, buf, IO_ISAIO, count, pos); -} - -STATIC ssize_t -xfs_file_aio_write_invis( - struct kiocb *iocb, - const char __user *buf, - size_t count, - loff_t pos) -{ - return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); -} - -STATIC inline ssize_t -__xfs_file_readv( - struct file *file, - const struct iovec *iov, - int ioflags, - unsigned long nr_segs, - loff_t *ppos) -{ - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - struct kiocb kiocb; - ssize_t rval; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - rval = bhv_vop_read(vp, &kiocb, iov, nr_segs, - &kiocb.ki_pos, ioflags, NULL); - - *ppos = kiocb.ki_pos; - return rval; -} - -STATIC ssize_t -xfs_file_readv( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_readv(file, iov, 0, nr_segs, ppos); -} - -STATIC ssize_t -xfs_file_readv_invis( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos); -} - -STATIC inline ssize_t -__xfs_file_writev( - struct file *file, - const struct iovec *iov, - int ioflags, - unsigned long nr_segs, - loff_t *ppos) -{ - struct inode *inode = file->f_mapping->host; - bhv_vnode_t *vp = vn_from_inode(inode); - struct kiocb kiocb; - ssize_t rval; - - init_sync_kiocb(&kiocb, file); - kiocb.ki_pos = *ppos; - if (unlikely(file->f_flags & O_DIRECT)) - ioflags |= IO_ISDIRECT; - - rval = bhv_vop_write(vp, &kiocb, iov, nr_segs, - &kiocb.ki_pos, ioflags, NULL); - - *ppos = kiocb.ki_pos; - return rval; -} - -STATIC ssize_t -xfs_file_writev( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_writev(file, iov, 0, nr_segs, ppos); -} - -STATIC ssize_t -xfs_file_writev_invis( - struct file *file, - const struct iovec *iov, - unsigned long nr_segs, - loff_t *ppos) -{ - return __xfs_file_writev(file, iov, IO_INVIS, nr_segs, ppos); -} - -STATIC ssize_t -xfs_file_sendfile( - struct file *filp, - loff_t *pos, - size_t count, - read_actor_t actor, - void *target) -{ - return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode), - filp, pos, 0, count, actor, target, NULL); -} - -STATIC ssize_t -xfs_file_sendfile_invis( - struct file *filp, - loff_t *pos, - size_t count, - read_actor_t actor, - void *target) -{ - return bhv_vop_sendfile(vn_from_inode(filp->f_dentry->d_inode), - filp, pos, IO_INVIS, count, actor, target, NULL); -} - -STATIC ssize_t -xfs_file_splice_read( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode), - infilp, ppos, pipe, len, flags, 0, NULL); -} - -STATIC ssize_t -xfs_file_splice_read_invis( - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, - unsigned int flags) -{ - return bhv_vop_splice_read(vn_from_inode(infilp->f_dentry->d_inode), - infilp, ppos, pipe, len, flags, IO_INVIS, - NULL); -} - -STATIC ssize_t -xfs_file_splice_write( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode), - pipe, outfilp, ppos, len, flags, 0, NULL); -} - -STATIC ssize_t -xfs_file_splice_write_invis( - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t len, - unsigned int flags) -{ - return bhv_vop_splice_write(vn_from_inode(outfilp->f_dentry->d_inode), - pipe, outfilp, ppos, len, flags, IO_INVIS, - NULL); -} - -STATIC int -xfs_file_open( - struct inode *inode, - struct file *filp) -{ - if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) - return -EFBIG; - return -bhv_vop_open(vn_from_inode(inode), NULL); -} - -STATIC int -xfs_file_close( - struct file *filp, - fl_owner_t id) -{ - return -bhv_vop_close(vn_from_inode(filp->f_dentry->d_inode), 0, - file_count(filp) > 1 ? L_FALSE : L_TRUE, NULL); -} - -STATIC int -xfs_file_release( - struct inode *inode, - struct file *filp) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - - if (vp) - return -bhv_vop_release(vp); - return 0; -} - -STATIC int -xfs_file_fsync( - struct file *filp, - struct dentry *dentry, - int datasync) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - int flags = FSYNC_WAIT; - - if (datasync) - flags |= FSYNC_DATA; - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - return -bhv_vop_fsync(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1); -} - -#ifdef CONFIG_XFS_DMAPI -STATIC struct page * -xfs_vm_nopage( - struct vm_area_struct *area, - unsigned long address, - int *type) -{ - struct inode *inode = area->vm_file->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - - ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI); - if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), area, 0)) - return NULL; - return filemap_nopage(area, address, type); -} -#endif /* CONFIG_XFS_DMAPI */ - -STATIC int -xfs_file_readdir( - struct file *filp, - void *dirent, - filldir_t filldir) -{ - int error = 0; - bhv_vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); - uio_t uio; - iovec_t iov; - int eof = 0; - caddr_t read_buf; - int namelen, size = 0; - size_t rlen = PAGE_CACHE_SIZE; - xfs_off_t start_offset, curr_offset; - xfs_dirent_t *dbp = NULL; - - /* Try fairly hard to get memory */ - do { - if ((read_buf = kmalloc(rlen, GFP_KERNEL))) - break; - rlen >>= 1; - } while (rlen >= 1024); - - if (read_buf == NULL) - return -ENOMEM; - - uio.uio_iov = &iov; - uio.uio_segflg = UIO_SYSSPACE; - curr_offset = filp->f_pos; - if (filp->f_pos != 0x7fffffff) - uio.uio_offset = filp->f_pos; - else - uio.uio_offset = 0xffffffff; - - while (!eof) { - uio.uio_resid = iov.iov_len = rlen; - iov.iov_base = read_buf; - uio.uio_iovcnt = 1; - - start_offset = uio.uio_offset; - - error = bhv_vop_readdir(vp, &uio, NULL, &eof); - if ((uio.uio_offset == start_offset) || error) { - size = 0; - break; - } - - size = rlen - uio.uio_resid; - dbp = (xfs_dirent_t *)read_buf; - while (size > 0) { - namelen = strlen(dbp->d_name); - - if (filldir(dirent, dbp->d_name, namelen, - (loff_t) curr_offset & 0x7fffffff, - (ino_t) dbp->d_ino, - DT_UNKNOWN)) { - goto done; - } - size -= dbp->d_reclen; - curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; - dbp = (xfs_dirent_t *)((char *)dbp + dbp->d_reclen); - } - } -done: - if (!error) { - if (size == 0) - filp->f_pos = uio.uio_offset & 0x7fffffff; - else if (dbp) - filp->f_pos = curr_offset; - } - - kfree(read_buf); - return -error; -} - -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - -#ifdef CONFIG_XFS_DMAPI - if (vn_from_inode(filp->f_dentry->d_inode)->v_vfsp->vfs_flag & VFS_DMI) - vma->vm_ops = &xfs_dmapi_file_vm_ops; -#endif /* CONFIG_XFS_DMAPI */ - - file_accessed(filp); - return 0; -} - -STATIC long -xfs_file_ioctl( - struct file *filp, - unsigned int cmd, - unsigned long p) -{ - int error; - struct inode *inode = filp->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - - error = bhv_vop_ioctl(vp, inode, filp, 0, cmd, (void __user *)p); - VMODIFY(vp); - - /* NOTE: some of the ioctl's return positive #'s as a - * byte count indicating success, such as - * readlink_by_handle. So we don't "sign flip" - * like most other routines. This means true - * errors need to be returned as a negative value. - */ - return error; -} - -STATIC long -xfs_file_ioctl_invis( - struct file *filp, - unsigned int cmd, - unsigned long p) -{ - int error; - struct inode *inode = filp->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - - error = bhv_vop_ioctl(vp, inode, filp, IO_INVIS, cmd, (void __user *)p); - VMODIFY(vp); - - /* NOTE: some of the ioctl's return positive #'s as a - * byte count indicating success, such as - * readlink_by_handle. So we don't "sign flip" - * like most other routines. This means true - * errors need to be returned as a negative value. - */ - return error; -} - -#ifdef CONFIG_XFS_DMAPI -#ifdef HAVE_VMOP_MPROTECT -STATIC int -xfs_vm_mprotect( - struct vm_area_struct *vma, - unsigned int newflags) -{ - bhv_vnode_t *vp = vn_from_inode(vma->vm_file->f_dentry->d_inode); - int error = 0; - - if (vp->v_vfsp->vfs_flag & VFS_DMI) { - if ((vma->vm_flags & VM_MAYSHARE) && - (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) { - xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); - - error = XFS_SEND_MMAP(mp, vma, VM_WRITE); - } - } - return error; -} -#endif /* HAVE_VMOP_MPROTECT */ -#endif /* CONFIG_XFS_DMAPI */ - -#ifdef HAVE_FOP_OPEN_EXEC -/* If the user is attempting to execute a file that is offline then - * we have to trigger a DMAPI READ event before the file is marked as busy - * otherwise the invisible I/O will not be able to write to the file to bring - * it back online. - */ -STATIC int -xfs_file_open_exec( - struct inode *inode) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - - if (unlikely(vp->v_vfsp->vfs_flag & VFS_DMI)) { - xfs_mount_t *mp = XFS_VFSTOM(vp->v_vfsp); - xfs_inode_t *ip = xfs_vtoi(vp); - - if (!ip) - return -EINVAL; - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ)) - return -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, - 0, 0, 0, NULL); - } - return 0; -} -#endif /* HAVE_FOP_OPEN_EXEC */ - -const struct file_operations xfs_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .readv = xfs_file_readv, - .writev = xfs_file_writev, - .aio_read = xfs_file_aio_read, - .aio_write = xfs_file_aio_write, - .sendfile = xfs_file_sendfile, - .splice_read = xfs_file_splice_read, - .splice_write = xfs_file_splice_write, - .unlocked_ioctl = xfs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = xfs_file_compat_ioctl, -#endif - .mmap = xfs_file_mmap, - .open = xfs_file_open, - .flush = xfs_file_close, - .release = xfs_file_release, - .fsync = xfs_file_fsync, -#ifdef HAVE_FOP_OPEN_EXEC - .open_exec = xfs_file_open_exec, -#endif -}; - -const struct file_operations xfs_invis_file_operations = { - .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .readv = xfs_file_readv_invis, - .writev = xfs_file_writev_invis, - .aio_read = xfs_file_aio_read_invis, - .aio_write = xfs_file_aio_write_invis, - .sendfile = xfs_file_sendfile_invis, - .splice_read = xfs_file_splice_read_invis, - .splice_write = xfs_file_splice_write_invis, - .unlocked_ioctl = xfs_file_ioctl_invis, -#ifdef CONFIG_COMPAT - .compat_ioctl = xfs_file_compat_invis_ioctl, -#endif - .mmap = xfs_file_mmap, - .open = xfs_file_open, - .flush = xfs_file_close, - .release = xfs_file_release, - .fsync = xfs_file_fsync, -}; - - -const struct file_operations xfs_dir_file_operations = { - .read = generic_read_dir, - .readdir = xfs_file_readdir, - .unlocked_ioctl = xfs_file_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = xfs_file_compat_ioctl, -#endif - .fsync = xfs_file_fsync, -}; - -static struct vm_operations_struct xfs_file_vm_ops = { - .nopage = filemap_nopage, - .populate = filemap_populate, -}; - -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops = { - .nopage = xfs_vm_nopage, - .populate = filemap_populate, -#ifdef HAVE_VMOP_MPROTECT - .mprotect = xfs_vm_mprotect, -#endif -}; -#endif /* CONFIG_XFS_DMAPI */ diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c deleted file mode 100644 index dc0562828e7..00000000000 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" - -int fs_noerr(void) { return 0; } -int fs_nosys(void) { return ENOSYS; } -void fs_noval(void) { return; } - -void -fs_tosspages( - bhv_desc_t *bdp, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - struct inode *ip = vn_to_inode(vp); - - if (VN_CACHED(vp)) - truncate_inode_pages(ip->i_mapping, first); -} - -void -fs_flushinval_pages( - bhv_desc_t *bdp, - xfs_off_t first, - xfs_off_t last, - int fiopt) -{ - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - struct inode *ip = vn_to_inode(vp); - - if (VN_CACHED(vp)) { - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - filemap_write_and_wait(ip->i_mapping); - truncate_inode_pages(ip->i_mapping, first); - } -} - -int -fs_flush_pages( - bhv_desc_t *bdp, - xfs_off_t first, - xfs_off_t last, - uint64_t flags, - int fiopt) -{ - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - struct inode *ip = vn_to_inode(vp); - - if (VN_DIRTY(vp)) { - if (VN_TRUNC(vp)) - VUNTRUNCATE(vp); - filemap_fdatawrite(ip->i_mapping); - if (flags & XFS_B_ASYNC) - return 0; - filemap_fdatawait(ip->i_mapping); - } - return 0; -} diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.h b/fs/xfs/linux-2.6/xfs_fs_subr.h deleted file mode 100644 index aee9ccdd18f..00000000000 --- a/fs/xfs/linux-2.6/xfs_fs_subr.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_FS_SUBR_H__ -#define __XFS_FS_SUBR_H__ - -struct cred; -extern int fs_noerr(void); -extern int fs_nosys(void); -extern void fs_noval(void); -extern void fs_tosspages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -extern void fs_flushinval_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -extern int fs_flush_pages(bhv_desc_t *, xfs_off_t, xfs_off_t, uint64_t, int); - -#endif /* __XFS_FS_SUBR_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h deleted file mode 100644 index e1a22bfcf86..00000000000 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_GLOBALS_H__ -#define __XFS_GLOBALS_H__ - -extern uint64_t xfs_panic_mask; /* set to cause more panics */ -extern unsigned long xfs_physmem; -extern struct cred *sys_cred; - -#endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c deleted file mode 100644 index 6e52a5dd38d..00000000000 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ /dev/null @@ -1,1353 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_itable.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_bmap.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_dfrag.h" -#include "xfs_fsops.h" - -#include <linux/capability.h> -#include <linux/dcache.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <linux/pagemap.h> - -/* - * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to - * a file or fs handle. - * - * XFS_IOC_PATH_TO_FSHANDLE - * returns fs handle for a mount point or path within that mount point - * XFS_IOC_FD_TO_HANDLE - * returns full handle for a FD opened in user space - * XFS_IOC_PATH_TO_HANDLE - * returns full handle for a path - */ -STATIC int -xfs_find_handle( - unsigned int cmd, - void __user *arg) -{ - int hsize; - xfs_handle_t handle; - xfs_fsop_handlereq_t hreq; - struct inode *inode; - bhv_vnode_t *vp; - - if (copy_from_user(&hreq, arg, sizeof(hreq))) - return -XFS_ERROR(EFAULT); - - memset((char *)&handle, 0, sizeof(handle)); - - switch (cmd) { - case XFS_IOC_PATH_TO_FSHANDLE: - case XFS_IOC_PATH_TO_HANDLE: { - struct nameidata nd; - int error; - - error = user_path_walk_link((const char __user *)hreq.path, &nd); - if (error) - return error; - - ASSERT(nd.dentry); - ASSERT(nd.dentry->d_inode); - inode = igrab(nd.dentry->d_inode); - path_release(&nd); - break; - } - - case XFS_IOC_FD_TO_HANDLE: { - struct file *file; - - file = fget(hreq.fd); - if (!file) - return -EBADF; - - ASSERT(file->f_dentry); - ASSERT(file->f_dentry->d_inode); - inode = igrab(file->f_dentry->d_inode); - fput(file); - break; - } - - default: - ASSERT(0); - return -XFS_ERROR(EINVAL); - } - - if (inode->i_sb->s_magic != XFS_SB_MAGIC) { - /* we're not in XFS anymore, Toto */ - iput(inode); - return -XFS_ERROR(EINVAL); - } - - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - break; - default: - iput(inode); - return -XFS_ERROR(EBADF); - } - - /* we need the vnode */ - vp = vn_from_inode(inode); - - /* now we can grab the fsid */ - memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t)); - hsize = sizeof(xfs_fsid_t); - - if (cmd != XFS_IOC_PATH_TO_FSHANDLE) { - xfs_inode_t *ip; - int lock_mode; - - /* need to get access to the xfs_inode to read the generation */ - ip = xfs_vtoi(vp); - ASSERT(ip); - lock_mode = xfs_ilock_map_shared(ip); - - /* fill in fid section of handle from inode */ - handle.ha_fid.xfs_fid_len = sizeof(xfs_fid_t) - - sizeof(handle.ha_fid.xfs_fid_len); - handle.ha_fid.xfs_fid_pad = 0; - handle.ha_fid.xfs_fid_gen = ip->i_d.di_gen; - handle.ha_fid.xfs_fid_ino = ip->i_ino; - - xfs_iunlock_map_shared(ip, lock_mode); - - hsize = XFS_HSIZE(handle); - } - - /* now copy our handle into the user buffer & write out the size */ - if (copy_to_user(hreq.ohandle, &handle, hsize) || - copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) { - iput(inode); - return -XFS_ERROR(EFAULT); - } - - iput(inode); - return 0; -} - - -/* - * Convert userspace handle data into vnode (and inode). - * We [ab]use the fact that all the fsop_handlereq ioctl calls - * have a data structure argument whose first component is always - * a xfs_fsop_handlereq_t, so we can cast to and from this type. - * This allows us to optimise the copy_from_user calls and gives - * a handy, shared routine. - * - * If no error, caller must always VN_RELE the returned vp. - */ -STATIC int -xfs_vget_fsop_handlereq( - xfs_mount_t *mp, - struct inode *parinode, /* parent inode pointer */ - xfs_fsop_handlereq_t *hreq, - bhv_vnode_t **vp, - struct inode **inode) -{ - void __user *hanp; - size_t hlen; - xfs_fid_t *xfid; - xfs_handle_t *handlep; - xfs_handle_t handle; - xfs_inode_t *ip; - struct inode *inodep; - bhv_vnode_t *vpp; - xfs_ino_t ino; - __u32 igen; - int error; - - /* - * Only allow handle opens under a directory. - */ - if (!S_ISDIR(parinode->i_mode)) - return XFS_ERROR(ENOTDIR); - - hanp = hreq->ihandle; - hlen = hreq->ihandlen; - handlep = &handle; - - if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep)) - return XFS_ERROR(EINVAL); - if (copy_from_user(handlep, hanp, hlen)) - return XFS_ERROR(EFAULT); - if (hlen < sizeof(*handlep)) - memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen); - if (hlen > sizeof(handlep->ha_fsid)) { - if (handlep->ha_fid.xfs_fid_len != - (hlen - sizeof(handlep->ha_fsid) - - sizeof(handlep->ha_fid.xfs_fid_len)) - || handlep->ha_fid.xfs_fid_pad) - return XFS_ERROR(EINVAL); - } - - /* - * Crack the handle, obtain the inode # & generation # - */ - xfid = (struct xfs_fid *)&handlep->ha_fid; - if (xfid->xfs_fid_len == sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) { - ino = xfid->xfs_fid_ino; - igen = xfid->xfs_fid_gen; - } else { - return XFS_ERROR(EINVAL); - } - - /* - * Get the XFS inode, building a vnode to go with it. - */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) - return error; - if (ip == NULL) - return XFS_ERROR(EIO); - if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - return XFS_ERROR(ENOENT); - } - - vpp = XFS_ITOV(ip); - inodep = vn_to_inode(vpp); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - *vp = vpp; - *inode = inodep; - return 0; -} - -STATIC int -xfs_open_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct file *parfilp, - struct inode *parinode) -{ - int error; - int new_fd; - int permflag; - struct file *filp; - struct inode *inode; - struct dentry *dentry; - bhv_vnode_t *vp; - xfs_fsop_handlereq_t hreq; - - if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); - if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); - - error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); - if (error) - return -error; - - /* Restrict xfs_open_by_handle to directories & regular files. */ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - iput(inode); - return -XFS_ERROR(EINVAL); - } - -#if BITS_PER_LONG != 32 - hreq.oflags |= O_LARGEFILE; -#endif - /* Put open permission in namei format. */ - permflag = hreq.oflags; - if ((permflag+1) & O_ACCMODE) - permflag++; - if (permflag & O_TRUNC) - permflag |= 2; - - if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && - (permflag & FMODE_WRITE) && IS_APPEND(inode)) { - iput(inode); - return -XFS_ERROR(EPERM); - } - - if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) { - iput(inode); - return -XFS_ERROR(EACCES); - } - - /* Can't write directories. */ - if ( S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) { - iput(inode); - return -XFS_ERROR(EISDIR); - } - - if ((new_fd = get_unused_fd()) < 0) { - iput(inode); - return new_fd; - } - - dentry = d_alloc_anon(inode); - if (dentry == NULL) { - iput(inode); - put_unused_fd(new_fd); - return -XFS_ERROR(ENOMEM); - } - - /* Ensure umount returns EBUSY on umounts while this file is open. */ - mntget(parfilp->f_vfsmnt); - - /* Create file pointer. */ - filp = dentry_open(dentry, parfilp->f_vfsmnt, hreq.oflags); - if (IS_ERR(filp)) { - put_unused_fd(new_fd); - return -XFS_ERROR(-PTR_ERR(filp)); - } - if (inode->i_mode & S_IFREG) - filp->f_op = &xfs_invis_file_operations; - - fd_install(new_fd, filp); - return new_fd; -} - -STATIC int -xfs_readlink_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct file *parfilp, - struct inode *parinode) -{ - int error; - struct iovec aiov; - struct uio auio; - struct inode *inode; - xfs_fsop_handlereq_t hreq; - bhv_vnode_t *vp; - __u32 olen; - - if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); - if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) - return -XFS_ERROR(EFAULT); - - error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &vp, &inode); - if (error) - return -error; - - /* Restrict this handle operation to symlinks only. */ - if (!S_ISLNK(inode->i_mode)) { - VN_RELE(vp); - return -XFS_ERROR(EINVAL); - } - - if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) { - VN_RELE(vp); - return -XFS_ERROR(EFAULT); - } - aiov.iov_len = olen; - aiov.iov_base = hreq.ohandle; - - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = 0; - auio.uio_segflg = UIO_USERSPACE; - auio.uio_resid = olen; - - error = bhv_vop_readlink(vp, &auio, IO_INVIS, NULL); - VN_RELE(vp); - if (error) - return -error; - - return (olen - auio.uio_resid); -} - -STATIC int -xfs_fssetdm_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct file *parfilp, - struct inode *parinode) -{ - int error; - struct fsdmidata fsd; - xfs_fsop_setdm_handlereq_t dmhreq; - struct inode *inode; - bhv_desc_t *bdp; - bhv_vnode_t *vp; - - if (!capable(CAP_MKNOD)) - return -XFS_ERROR(EPERM); - if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) - return -XFS_ERROR(EFAULT); - - error = xfs_vget_fsop_handlereq(mp, parinode, &dmhreq.hreq, &vp, &inode); - if (error) - return -error; - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) { - VN_RELE(vp); - return -XFS_ERROR(EPERM); - } - - if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { - VN_RELE(vp); - return -XFS_ERROR(EFAULT); - } - - bdp = bhv_base_unlocked(VN_BHV_HEAD(vp)); - error = xfs_set_dmattrs(bdp, fsd.fsd_dmevmask, fsd.fsd_dmstate, NULL); - - VN_RELE(vp); - if (error) - return -error; - return 0; -} - -STATIC int -xfs_attrlist_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct file *parfilp, - struct inode *parinode) -{ - int error; - attrlist_cursor_kern_t *cursor; - xfs_fsop_attrlist_handlereq_t al_hreq; - struct inode *inode; - bhv_vnode_t *vp; - char *kbuf; - - if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); - if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) - return -XFS_ERROR(EFAULT); - if (al_hreq.buflen > XATTR_LIST_MAX) - return -XFS_ERROR(EINVAL); - - error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, - &vp, &inode); - if (error) - goto out; - - kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL); - if (!kbuf) - goto out_vn_rele; - - cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = bhv_vop_attr_list(vp, kbuf, al_hreq.buflen, al_hreq.flags, - cursor, NULL); - if (error) - goto out_kfree; - - if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) - error = -EFAULT; - - out_kfree: - kfree(kbuf); - out_vn_rele: - VN_RELE(vp); - out: - return -error; -} - -STATIC int -xfs_attrmulti_attr_get( - bhv_vnode_t *vp, - char *name, - char __user *ubuf, - __uint32_t *len, - __uint32_t flags) -{ - char *kbuf; - int error = EFAULT; - - if (*len > XATTR_SIZE_MAX) - return EINVAL; - kbuf = kmalloc(*len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; - - error = bhv_vop_attr_get(vp, name, kbuf, len, flags, NULL); - if (error) - goto out_kfree; - - if (copy_to_user(ubuf, kbuf, *len)) - error = EFAULT; - - out_kfree: - kfree(kbuf); - return error; -} - -STATIC int -xfs_attrmulti_attr_set( - bhv_vnode_t *vp, - char *name, - const char __user *ubuf, - __uint32_t len, - __uint32_t flags) -{ - char *kbuf; - int error = EFAULT; - - if (IS_RDONLY(&vp->v_inode)) - return -EROFS; - if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) - return EPERM; - if (len > XATTR_SIZE_MAX) - return EINVAL; - - kbuf = kmalloc(len, GFP_KERNEL); - if (!kbuf) - return ENOMEM; - - if (copy_from_user(kbuf, ubuf, len)) - goto out_kfree; - - error = bhv_vop_attr_set(vp, name, kbuf, len, flags, NULL); - - out_kfree: - kfree(kbuf); - return error; -} - -STATIC int -xfs_attrmulti_attr_remove( - bhv_vnode_t *vp, - char *name, - __uint32_t flags) -{ - if (IS_RDONLY(&vp->v_inode)) - return -EROFS; - if (IS_IMMUTABLE(&vp->v_inode) || IS_APPEND(&vp->v_inode)) - return EPERM; - return bhv_vop_attr_remove(vp, name, flags, NULL); -} - -STATIC int -xfs_attrmulti_by_handle( - xfs_mount_t *mp, - void __user *arg, - struct file *parfilp, - struct inode *parinode) -{ - int error; - xfs_attr_multiop_t *ops; - xfs_fsop_attrmulti_handlereq_t am_hreq; - struct inode *inode; - bhv_vnode_t *vp; - unsigned int i, size; - char *attr_name; - - if (!capable(CAP_SYS_ADMIN)) - return -XFS_ERROR(EPERM); - if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) - return -XFS_ERROR(EFAULT); - - error = xfs_vget_fsop_handlereq(mp, parinode, &am_hreq.hreq, &vp, &inode); - if (error) - goto out; - - error = E2BIG; - size = am_hreq.opcount * sizeof(attr_multiop_t); - if (!size || size > 16 * PAGE_SIZE) - goto out_vn_rele; - - error = ENOMEM; - ops = kmalloc(size, GFP_KERNEL); - if (!ops) - goto out_vn_rele; - - error = EFAULT; - if (copy_from_user(ops, am_hreq.ops, size)) - goto out_kfree_ops; - - attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); - if (!attr_name) - goto out_kfree_ops; - - - error = 0; - for (i = 0; i < am_hreq.opcount; i++) { - ops[i].am_error = strncpy_from_user(attr_name, - ops[i].am_attrname, MAXNAMELEN); - if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) - error = -ERANGE; - if (ops[i].am_error < 0) - break; - - switch (ops[i].am_opcode) { - case ATTR_OP_GET: - ops[i].am_error = xfs_attrmulti_attr_get(vp, - attr_name, ops[i].am_attrvalue, - &ops[i].am_length, ops[i].am_flags); - break; - case ATTR_OP_SET: - ops[i].am_error = xfs_attrmulti_attr_set(vp, - attr_name, ops[i].am_attrvalue, - ops[i].am_length, ops[i].am_flags); - break; - case ATTR_OP_REMOVE: - ops[i].am_error = xfs_attrmulti_attr_remove(vp, - attr_name, ops[i].am_flags); - break; - default: - ops[i].am_error = EINVAL; - } - } - - if (copy_to_user(am_hreq.ops, ops, size)) - error = XFS_ERROR(EFAULT); - - kfree(attr_name); - out_kfree_ops: - kfree(ops); - out_vn_rele: - VN_RELE(vp); - out: - return -error; -} - -/* prototypes for a few of the stack-hungry cases that have - * their own functions. Functions are defined after their use - * so gcc doesn't get fancy and inline them with -03 */ - -STATIC int -xfs_ioc_space( - bhv_desc_t *bdp, - bhv_vnode_t *vp, - struct file *filp, - int flags, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_bulkstat( - xfs_mount_t *mp, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_fsgeometry_v1( - xfs_mount_t *mp, - void __user *arg); - -STATIC int -xfs_ioc_fsgeometry( - xfs_mount_t *mp, - void __user *arg); - -STATIC int -xfs_ioc_xattr( - bhv_vnode_t *vp, - xfs_inode_t *ip, - struct file *filp, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_getbmap( - bhv_desc_t *bdp, - struct file *filp, - int flags, - unsigned int cmd, - void __user *arg); - -STATIC int -xfs_ioc_getbmapx( - bhv_desc_t *bdp, - void __user *arg); - -int -xfs_ioctl( - bhv_desc_t *bdp, - struct inode *inode, - struct file *filp, - int ioflags, - unsigned int cmd, - void __user *arg) -{ - int error; - bhv_vnode_t *vp; - xfs_inode_t *ip; - xfs_mount_t *mp; - - vp = vn_from_inode(inode); - - vn_trace_entry(vp, "xfs_ioctl", (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - switch (cmd) { - - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - /* - * Only allow the sys admin to reserve space unless - * unwritten extents are enabled. - */ - if (!XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); - - case XFS_IOC_DIOINFO: { - struct dioattr da; - xfs_buftarg_t *target = - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - da.d_mem = da.d_miniosz = 1 << target->bt_sshift; - da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); - - if (copy_to_user(arg, &da, sizeof(da))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_FSBULKSTAT_SINGLE: - case XFS_IOC_FSBULKSTAT: - case XFS_IOC_FSINUMBERS: - return xfs_ioc_bulkstat(mp, cmd, arg); - - case XFS_IOC_FSGEOMETRY_V1: - return xfs_ioc_fsgeometry_v1(mp, arg); - - case XFS_IOC_FSGEOMETRY: - return xfs_ioc_fsgeometry(mp, arg); - - case XFS_IOC_GETVERSION: - case XFS_IOC_GETXFLAGS: - case XFS_IOC_SETXFLAGS: - case XFS_IOC_FSGETXATTR: - case XFS_IOC_FSSETXATTR: - case XFS_IOC_FSGETXATTRA: - return xfs_ioc_xattr(vp, ip, filp, cmd, arg); - - case XFS_IOC_FSSETDM: { - struct fsdmidata dmi; - - if (copy_from_user(&dmi, arg, sizeof(dmi))) - return -XFS_ERROR(EFAULT); - - error = xfs_set_dmattrs(bdp, dmi.fsd_dmevmask, dmi.fsd_dmstate, - NULL); - return -error; - } - - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg); - - case XFS_IOC_GETBMAPX: - return xfs_ioc_getbmapx(bdp, arg); - - case XFS_IOC_FD_TO_HANDLE: - case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - return xfs_find_handle(cmd, arg); - - case XFS_IOC_OPEN_BY_HANDLE: - return xfs_open_by_handle(mp, arg, filp, inode); - - case XFS_IOC_FSSETDM_BY_HANDLE: - return xfs_fssetdm_by_handle(mp, arg, filp, inode); - - case XFS_IOC_READLINK_BY_HANDLE: - return xfs_readlink_by_handle(mp, arg, filp, inode); - - case XFS_IOC_ATTRLIST_BY_HANDLE: - return xfs_attrlist_by_handle(mp, arg, filp, inode); - - case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, filp, inode); - - case XFS_IOC_SWAPEXT: { - error = xfs_swapext((struct xfs_swapext __user *)arg); - return -error; - } - - case XFS_IOC_FSCOUNTS: { - xfs_fsop_counts_t out; - - error = xfs_fs_counts(mp, &out); - if (error) - return -error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_SET_RESBLKS: { - xfs_fsop_resblks_t inout; - __uint64_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&inout, arg, sizeof(inout))) - return -XFS_ERROR(EFAULT); - - /* input parameter is passed in resblks field of structure */ - in = inout.resblks; - error = xfs_reserve_blocks(mp, &in, &inout); - if (error) - return -error; - - if (copy_to_user(arg, &inout, sizeof(inout))) - return -XFS_ERROR(EFAULT); - return 0; - } - - case XFS_IOC_GET_RESBLKS: { - xfs_fsop_resblks_t out; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_reserve_blocks(mp, NULL, &out); - if (error) - return -error; - - if (copy_to_user(arg, &out, sizeof(out))) - return -XFS_ERROR(EFAULT); - - return 0; - } - - case XFS_IOC_FSGROWFSDATA: { - xfs_growfs_data_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_data(mp, &in); - return -error; - } - - case XFS_IOC_FSGROWFSLOG: { - xfs_growfs_log_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_log(mp, &in); - return -error; - } - - case XFS_IOC_FSGROWFSRT: { - xfs_growfs_rt_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_growfs_rt(mp, &in); - return -error; - } - - case XFS_IOC_FREEZE: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (inode->i_sb->s_frozen == SB_UNFROZEN) - freeze_bdev(inode->i_sb->s_bdev); - return 0; - - case XFS_IOC_THAW: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (inode->i_sb->s_frozen != SB_UNFROZEN) - thaw_bdev(inode->i_sb->s_bdev, inode->i_sb); - return 0; - - case XFS_IOC_GOINGDOWN: { - __uint32_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (get_user(in, (__uint32_t __user *)arg)) - return -XFS_ERROR(EFAULT); - - error = xfs_fs_goingdown(mp, in); - return -error; - } - - case XFS_IOC_ERROR_INJECTION: { - xfs_error_injection_t in; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (copy_from_user(&in, arg, sizeof(in))) - return -XFS_ERROR(EFAULT); - - error = xfs_errortag_add(in.errtag, mp); - return -error; - } - - case XFS_IOC_ERROR_CLEARALL: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - error = xfs_errortag_clearall(mp); - return -error; - - default: - return -ENOTTY; - } -} - -STATIC int -xfs_ioc_space( - bhv_desc_t *bdp, - bhv_vnode_t *vp, - struct file *filp, - int ioflags, - unsigned int cmd, - void __user *arg) -{ - xfs_flock64_t bf; - int attr_flags = 0; - int error; - - if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) - return -XFS_ERROR(EPERM); - - if (!(filp->f_mode & FMODE_WRITE)) - return -XFS_ERROR(EBADF); - - if (!VN_ISREG(vp)) - return -XFS_ERROR(EINVAL); - - if (copy_from_user(&bf, arg, sizeof(bf))) - return -XFS_ERROR(EFAULT); - - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; - if (ioflags & IO_INVIS) - attr_flags |= ATTR_DMI; - - error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos, - NULL, attr_flags); - return -error; -} - -STATIC int -xfs_ioc_bulkstat( - xfs_mount_t *mp, - unsigned int cmd, - void __user *arg) -{ - xfs_fsop_bulkreq_t bulkreq; - int count; /* # of records returned */ - xfs_ino_t inlast; /* last inode number */ - int done; - int error; - - /* done = 1 if there are more stats to get and if bulkstat */ - /* should be called again (unused here, but used in dmapi) */ - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -XFS_ERROR(EIO); - - if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) - return -XFS_ERROR(EFAULT); - - if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) - return -XFS_ERROR(EFAULT); - - if ((count = bulkreq.icount) <= 0) - return -XFS_ERROR(EINVAL); - - if (cmd == XFS_IOC_FSINUMBERS) - error = xfs_inumbers(mp, &inlast, &count, - bulkreq.ubuffer); - else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) - error = xfs_bulkstat_single(mp, &inlast, - bulkreq.ubuffer, &done); - else { /* XFS_IOC_FSBULKSTAT */ - if (count == 1 && inlast != 0) { - inlast++; - error = xfs_bulkstat_single(mp, &inlast, - bulkreq.ubuffer, &done); - } else { - error = xfs_bulkstat(mp, &inlast, &count, - (bulkstat_one_pf)xfs_bulkstat_one, NULL, - sizeof(xfs_bstat_t), bulkreq.ubuffer, - BULKSTAT_FG_QUICK, &done); - } - } - - if (error) - return -error; - - if (bulkreq.ocount != NULL) { - if (copy_to_user(bulkreq.lastip, &inlast, - sizeof(xfs_ino_t))) - return -XFS_ERROR(EFAULT); - - if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) - return -XFS_ERROR(EFAULT); - } - - return 0; -} - -STATIC int -xfs_ioc_fsgeometry_v1( - xfs_mount_t *mp, - void __user *arg) -{ - xfs_fsop_geom_v1_t fsgeo; - int error; - - error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); - if (error) - return -error; - - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) - return -XFS_ERROR(EFAULT); - return 0; -} - -STATIC int -xfs_ioc_fsgeometry( - xfs_mount_t *mp, - void __user *arg) -{ - xfs_fsop_geom_t fsgeo; - int error; - - error = xfs_fs_geometry(mp, &fsgeo, 4); - if (error) - return -error; - - if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) - return -XFS_ERROR(EFAULT); - return 0; -} - -/* - * Linux extended inode flags interface. - */ -#define LINUX_XFLAG_SYNC 0x00000008 /* Synchronous updates */ -#define LINUX_XFLAG_IMMUTABLE 0x00000010 /* Immutable file */ -#define LINUX_XFLAG_APPEND 0x00000020 /* writes to file may only append */ -#define LINUX_XFLAG_NODUMP 0x00000040 /* do not dump file */ -#define LINUX_XFLAG_NOATIME 0x00000080 /* do not update atime */ - -STATIC unsigned int -xfs_merge_ioc_xflags( - unsigned int flags, - unsigned int start) -{ - unsigned int xflags = start; - - if (flags & LINUX_XFLAG_IMMUTABLE) - xflags |= XFS_XFLAG_IMMUTABLE; - else - xflags &= ~XFS_XFLAG_IMMUTABLE; - if (flags & LINUX_XFLAG_APPEND) - xflags |= XFS_XFLAG_APPEND; - else - xflags &= ~XFS_XFLAG_APPEND; - if (flags & LINUX_XFLAG_SYNC) - xflags |= XFS_XFLAG_SYNC; - else - xflags &= ~XFS_XFLAG_SYNC; - if (flags & LINUX_XFLAG_NOATIME) - xflags |= XFS_XFLAG_NOATIME; - else - xflags &= ~XFS_XFLAG_NOATIME; - if (flags & LINUX_XFLAG_NODUMP) - xflags |= XFS_XFLAG_NODUMP; - else - xflags &= ~XFS_XFLAG_NODUMP; - - return xflags; -} - -STATIC unsigned int -xfs_di2lxflags( - __uint16_t di_flags) -{ - unsigned int flags = 0; - - if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= LINUX_XFLAG_IMMUTABLE; - if (di_flags & XFS_DIFLAG_APPEND) - flags |= LINUX_XFLAG_APPEND; - if (di_flags & XFS_DIFLAG_SYNC) - flags |= LINUX_XFLAG_SYNC; - if (di_flags & XFS_DIFLAG_NOATIME) - flags |= LINUX_XFLAG_NOATIME; - if (di_flags & XFS_DIFLAG_NODUMP) - flags |= LINUX_XFLAG_NODUMP; - return flags; -} - -STATIC int -xfs_ioc_xattr( - bhv_vnode_t *vp, - xfs_inode_t *ip, - struct file *filp, - unsigned int cmd, - void __user *arg) -{ - struct fsxattr fa; - struct bhv_vattr *vattr; - int error = 0; - int attr_flags; - unsigned int flags; - - vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); - if (unlikely(!vattr)) - return -ENOMEM; - - switch (cmd) { - case XFS_IOC_FSGETXATTR: { - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \ - XFS_AT_NEXTENTS | XFS_AT_PROJID; - error = bhv_vop_getattr(vp, vattr, 0, NULL); - if (unlikely(error)) { - error = -error; - break; - } - - fa.fsx_xflags = vattr->va_xflags; - fa.fsx_extsize = vattr->va_extsize; - fa.fsx_nextents = vattr->va_nextents; - fa.fsx_projid = vattr->va_projid; - - if (copy_to_user(arg, &fa, sizeof(fa))) { - error = -EFAULT; - break; - } - break; - } - - case XFS_IOC_FSSETXATTR: { - if (copy_from_user(&fa, arg, sizeof(fa))) { - error = -EFAULT; - break; - } - - attr_flags = 0; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; - - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; - vattr->va_xflags = fa.fsx_xflags; - vattr->va_extsize = fa.fsx_extsize; - vattr->va_projid = fa.fsx_projid; - - error = bhv_vop_setattr(vp, vattr, attr_flags, NULL); - if (likely(!error)) - __vn_revalidate(vp, vattr); /* update flags */ - error = -error; - break; - } - - case XFS_IOC_FSGETXATTRA: { - vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | \ - XFS_AT_ANEXTENTS | XFS_AT_PROJID; - error = bhv_vop_getattr(vp, vattr, 0, NULL); - if (unlikely(error)) { - error = -error; - break; - } - - fa.fsx_xflags = vattr->va_xflags; - fa.fsx_extsize = vattr->va_extsize; - fa.fsx_nextents = vattr->va_anextents; - fa.fsx_projid = vattr->va_projid; - - if (copy_to_user(arg, &fa, sizeof(fa))) { - error = -EFAULT; - break; - } - break; - } - - case XFS_IOC_GETXFLAGS: { - flags = xfs_di2lxflags(ip->i_d.di_flags); - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; - } - - case XFS_IOC_SETXFLAGS: { - if (copy_from_user(&flags, arg, sizeof(flags))) { - error = -EFAULT; - break; - } - - if (flags & ~(LINUX_XFLAG_IMMUTABLE | LINUX_XFLAG_APPEND | \ - LINUX_XFLAG_NOATIME | LINUX_XFLAG_NODUMP | \ - LINUX_XFLAG_SYNC)) { - error = -EOPNOTSUPP; - break; - } - - attr_flags = 0; - if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) - attr_flags |= ATTR_NONBLOCK; - - vattr->va_mask = XFS_AT_XFLAGS; - vattr->va_xflags = xfs_merge_ioc_xflags(flags, - xfs_ip2xflags(ip)); - - error = bhv_vop_setattr(vp, vattr, attr_flags, NULL); - if (likely(!error)) - __vn_revalidate(vp, vattr); /* update flags */ - error = -error; - break; - } - - case XFS_IOC_GETVERSION: { - flags = vn_to_inode(vp)->i_generation; - if (copy_to_user(arg, &flags, sizeof(flags))) - error = -EFAULT; - break; - } - - default: - error = -ENOTTY; - break; - } - - kfree(vattr); - return error; -} - -STATIC int -xfs_ioc_getbmap( - bhv_desc_t *bdp, - struct file *filp, - int ioflags, - unsigned int cmd, - void __user *arg) -{ - struct getbmap bm; - int iflags; - int error; - - if (copy_from_user(&bm, arg, sizeof(bm))) - return -XFS_ERROR(EFAULT); - - if (bm.bmv_count < 2) - return -XFS_ERROR(EINVAL); - - iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); - if (ioflags & IO_INVIS) - iflags |= BMV_IF_NO_DMAPI_READ; - - error = xfs_getbmap(bdp, &bm, (struct getbmap __user *)arg+1, iflags); - if (error) - return -error; - - if (copy_to_user(arg, &bm, sizeof(bm))) - return -XFS_ERROR(EFAULT); - return 0; -} - -STATIC int -xfs_ioc_getbmapx( - bhv_desc_t *bdp, - void __user *arg) -{ - struct getbmapx bmx; - struct getbmap bm; - int iflags; - int error; - - if (copy_from_user(&bmx, arg, sizeof(bmx))) - return -XFS_ERROR(EFAULT); - - if (bmx.bmv_count < 2) - return -XFS_ERROR(EINVAL); - - /* - * Map input getbmapx structure to a getbmap - * structure for xfs_getbmap. - */ - GETBMAP_CONVERT(bmx, bm); - - iflags = bmx.bmv_iflags; - - if (iflags & (~BMV_IF_VALID)) - return -XFS_ERROR(EINVAL); - - iflags |= BMV_IF_EXTENDED; - - error = xfs_getbmap(bdp, &bm, (struct getbmapx __user *)arg+1, iflags); - if (error) - return -error; - - GETBMAP_CONVERT(bm, bmx); - - if (copy_to_user(arg, &bmx, sizeof(bmx))) - return -XFS_ERROR(EFAULT); - - return 0; -} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c deleted file mode 100644 index 270db0f3861..00000000000 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2004-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <linux/compat.h> -#include <linux/init.h> -#include <linux/ioctl.h> -#include <linux/syscalls.h> -#include <linux/types.h> -#include <linux/fs.h> -#include <asm/uaccess.h> -#include "xfs.h" -#include "xfs_types.h" -#include "xfs_fs.h" -#include "xfs_vfs.h" -#include "xfs_vnode.h" -#include "xfs_dfrag.h" - -#define _NATIVE_IOC(cmd, type) \ - _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) - -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) -#define BROKEN_X86_ALIGNMENT -/* on ia32 l_start is on a 32-bit boundary */ -typedef struct xfs_flock64_32 { - __s16 l_type; - __s16 l_whence; - __s64 l_start __attribute__((packed)); - /* len == 0 means until end of file */ - __s64 l_len __attribute__((packed)); - __s32 l_sysid; - __u32 l_pid; - __s32 l_pad[4]; /* reserve area */ -} xfs_flock64_32_t; - -#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32) -#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32) -#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32) -#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32) -#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32) -#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32) -#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32) -#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32) - -/* just account for different alignment */ -STATIC unsigned long -xfs_ioctl32_flock( - unsigned long arg) -{ - xfs_flock64_32_t __user *p32 = (void __user *)arg; - xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p)); - - if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) || - copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) || - copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) || - copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) || - copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) || - copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || - copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) - return -EFAULT; - - return (unsigned long)p; -} - -#else - -typedef struct xfs_fsop_bulkreq32 { - compat_uptr_t lastip; /* last inode # pointer */ - __s32 icount; /* count of entries in buffer */ - compat_uptr_t ubuffer; /* user buffer for inode desc. */ - __s32 ocount; /* output count pointer */ -} xfs_fsop_bulkreq32_t; - -STATIC unsigned long -xfs_ioctl32_bulkstat( - unsigned long arg) -{ - xfs_fsop_bulkreq32_t __user *p32 = (void __user *)arg; - xfs_fsop_bulkreq_t __user *p = compat_alloc_user_space(sizeof(*p)); - u32 addr; - - if (get_user(addr, &p32->lastip) || - put_user(compat_ptr(addr), &p->lastip) || - copy_in_user(&p->icount, &p32->icount, sizeof(s32)) || - get_user(addr, &p32->ubuffer) || - put_user(compat_ptr(addr), &p->ubuffer) || - get_user(addr, &p32->ocount) || - put_user(compat_ptr(addr), &p->ocount)) - return -EFAULT; - - return (unsigned long)p; -} -#endif - -STATIC long -xfs_compat_ioctl( - int mode, - struct file *file, - unsigned cmd, - unsigned long arg) -{ - struct inode *inode = file->f_dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - int error; - - switch (cmd) { - case XFS_IOC_DIOINFO: - case XFS_IOC_FSGEOMETRY_V1: - case XFS_IOC_FSGEOMETRY: - case XFS_IOC_GETVERSION: - case XFS_IOC_GETXFLAGS: - case XFS_IOC_SETXFLAGS: - case XFS_IOC_FSGETXATTR: - case XFS_IOC_FSSETXATTR: - case XFS_IOC_FSGETXATTRA: - case XFS_IOC_FSSETDM: - case XFS_IOC_GETBMAP: - case XFS_IOC_GETBMAPA: - case XFS_IOC_GETBMAPX: -/* not handled - case XFS_IOC_FD_TO_HANDLE: - case XFS_IOC_PATH_TO_HANDLE: - case XFS_IOC_PATH_TO_FSHANDLE: - case XFS_IOC_OPEN_BY_HANDLE: - case XFS_IOC_FSSETDM_BY_HANDLE: - case XFS_IOC_READLINK_BY_HANDLE: - case XFS_IOC_ATTRLIST_BY_HANDLE: - case XFS_IOC_ATTRMULTI_BY_HANDLE: -*/ - case XFS_IOC_FSCOUNTS: - case XFS_IOC_SET_RESBLKS: - case XFS_IOC_GET_RESBLKS: - case XFS_IOC_FSGROWFSDATA: - case XFS_IOC_FSGROWFSLOG: - case XFS_IOC_FSGROWFSRT: - case XFS_IOC_FREEZE: - case XFS_IOC_THAW: - case XFS_IOC_GOINGDOWN: - case XFS_IOC_ERROR_INJECTION: - case XFS_IOC_ERROR_CLEARALL: - break; - -#ifdef BROKEN_X86_ALIGNMENT - /* xfs_flock_t has wrong u32 vs u64 alignment */ - case XFS_IOC_ALLOCSP_32: - case XFS_IOC_FREESP_32: - case XFS_IOC_ALLOCSP64_32: - case XFS_IOC_FREESP64_32: - case XFS_IOC_RESVSP_32: - case XFS_IOC_UNRESVSP_32: - case XFS_IOC_RESVSP64_32: - case XFS_IOC_UNRESVSP64_32: - arg = xfs_ioctl32_flock(arg); - cmd = _NATIVE_IOC(cmd, struct xfs_flock64); - break; - -#else /* These are handled fine if no alignment issues */ - case XFS_IOC_ALLOCSP: - case XFS_IOC_FREESP: - case XFS_IOC_RESVSP: - case XFS_IOC_UNRESVSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP64: - case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: - break; - - /* xfs_bstat_t still has wrong u32 vs u64 alignment */ - case XFS_IOC_SWAPEXT: - break; - - case XFS_IOC_FSBULKSTAT_SINGLE: - case XFS_IOC_FSBULKSTAT: - case XFS_IOC_FSINUMBERS: - arg = xfs_ioctl32_bulkstat(arg); - break; -#endif - default: - return -ENOIOCTLCMD; - } - - error = bhv_vop_ioctl(vp, inode, file, mode, cmd, (void __user *)arg); - VMODIFY(vp); - - return error; -} - -long -xfs_file_compat_ioctl( - struct file *file, - unsigned cmd, - unsigned long arg) -{ - return xfs_compat_ioctl(0, file, cmd, arg); -} - -long -xfs_file_compat_invis_ioctl( - struct file *file, - unsigned cmd, - unsigned long arg) -{ - return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); -} diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c deleted file mode 100644 index 22e3b714f62..00000000000 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ /dev/null @@ -1,844 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" - -#include <linux/capability.h> -#include <linux/xattr.h> -#include <linux/namei.h> -#include <linux/security.h> - -/* - * Get a XFS inode from a given vnode. - */ -xfs_inode_t * -xfs_vtoi( - bhv_vnode_t *vp) -{ - bhv_desc_t *bdp; - - bdp = bhv_lookup_range(VN_BHV_HEAD(vp), - VNODE_POSITION_XFS, VNODE_POSITION_XFS); - if (unlikely(bdp == NULL)) - return NULL; - return XFS_BHVTOI(bdp); -} - -/* - * Bring the atime in the XFS inode uptodate. - * Used before logging the inode to disk or when the Linux inode goes away. - */ -void -xfs_synchronize_atime( - xfs_inode_t *ip) -{ - bhv_vnode_t *vp; - - vp = XFS_ITOV_NULL(ip); - if (vp) { - struct inode *inode = &vp->v_inode; - ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec; - } -} - -/* - * Change the requested timestamp in the given inode. - * We don't lock across timestamp updates, and we don't log them but - * we do record the fact that there is dirty information in core. - * - * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG - * with XFS_ICHGTIME_ACC to be sure that access time - * update will take. Calling first with XFS_ICHGTIME_ACC - * and then XFS_ICHGTIME_MOD may fail to modify the access - * timestamp if the filesystem is mounted noacctm. - */ -void -xfs_ichgtime( - xfs_inode_t *ip, - int flags) -{ - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - timespec_t tv; - - nanotime(&tv); - if (flags & XFS_ICHGTIME_MOD) { - inode->i_mtime = tv; - ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; - } - if (flags & XFS_ICHGTIME_ACC) { - inode->i_atime = tv; - ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec; - } - if (flags & XFS_ICHGTIME_CHG) { - inode->i_ctime = tv; - ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec; - } - - /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. - */ - SYNCHRONIZE(); - ip->i_update_core = 1; - if (!(inode->i_state & I_LOCK)) - mark_inode_dirty_sync(inode); -} - -/* - * Variant on the above which avoids querying the system clock - * in situations where we know the Linux inode timestamps have - * just been updated (and so we can update our inode cheaply). - */ -void -xfs_ichgtime_fast( - xfs_inode_t *ip, - struct inode *inode, - int flags) -{ - timespec_t *tvp; - - /* - * Atime updates for read() & friends are handled lazily now, and - * explicit updates must go through xfs_ichgtime() - */ - ASSERT((flags & XFS_ICHGTIME_ACC) == 0); - - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (unlikely(IS_RDONLY(inode))) - return; - - if (flags & XFS_ICHGTIME_MOD) { - tvp = &inode->i_mtime; - ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec; - } - if (flags & XFS_ICHGTIME_CHG) { - tvp = &inode->i_ctime; - ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec; - ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec; - } - - /* - * We update the i_update_core field _after_ changing - * the timestamps in order to coordinate properly with - * xfs_iflush() so that we don't lose timestamp updates. - * This keeps us from having to hold the inode lock - * while doing this. We use the SYNCHRONIZE macro to - * ensure that the compiler does not reorder the update - * of i_update_core above the timestamp updates above. - */ - SYNCHRONIZE(); - ip->i_update_core = 1; - if (!(inode->i_state & I_LOCK)) - mark_inode_dirty_sync(inode); -} - - -/* - * Pull the link count and size up from the xfs inode to the linux inode - */ -STATIC void -xfs_validate_fields( - struct inode *ip, - bhv_vattr_t *vattr) -{ - vattr->va_mask = XFS_AT_NLINK|XFS_AT_SIZE|XFS_AT_NBLOCKS; - if (!bhv_vop_getattr(vn_from_inode(ip), vattr, ATTR_LAZY, NULL)) { - ip->i_nlink = vattr->va_nlink; - ip->i_blocks = vattr->va_nblocks; - - /* we're under i_sem so i_size can't change under us */ - if (i_size_read(ip) != vattr->va_size) - i_size_write(ip, vattr->va_size); - } -} - -/* - * Hook in SELinux. This is not quite correct yet, what we really need - * here (as we do for default ACLs) is a mechanism by which creation of - * these attrs can be journalled at inode creation time (along with the - * inode, of course, such that log replay can't cause these to be lost). - */ -STATIC int -xfs_init_security( - bhv_vnode_t *vp, - struct inode *dir) -{ - struct inode *ip = vn_to_inode(vp); - size_t length; - void *value; - char *name; - int error; - - error = security_inode_init_security(ip, dir, &name, &value, &length); - if (error) { - if (error == -EOPNOTSUPP) - return 0; - return -error; - } - - error = bhv_vop_attr_set(vp, name, value, length, ATTR_SECURE, NULL); - if (!error) - VMODIFY(vp); - - kfree(name); - kfree(value); - return error; -} - -/* - * Determine whether a process has a valid fs_struct (kernel daemons - * like knfsd don't have an fs_struct). - * - * XXX(hch): nfsd is broken, better fix it instead. - */ -STATIC inline int -xfs_has_fs_struct(struct task_struct *task) -{ - return (task->fs != init_task.fs); -} - -STATIC inline void -xfs_cleanup_inode( - bhv_vnode_t *dvp, - bhv_vnode_t *vp, - struct dentry *dentry, - int mode) -{ - struct dentry teardown = {}; - - /* Oh, the horror. - * If we can't add the ACL or we fail in - * xfs_init_security we must back out. - * ENOSPC can hit here, among other things. - */ - teardown.d_inode = vn_to_inode(vp); - teardown.d_name = dentry->d_name; - - if (S_ISDIR(mode)) - bhv_vop_rmdir(dvp, &teardown, NULL); - else - bhv_vop_remove(dvp, &teardown, NULL); - VN_RELE(vp); -} - -STATIC int -xfs_vn_mknod( - struct inode *dir, - struct dentry *dentry, - int mode, - dev_t rdev) -{ - struct inode *ip; - bhv_vattr_t vattr = { 0 }; - bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir); - xfs_acl_t *default_acl = NULL; - attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; - int error; - - /* - * Irix uses Missed'em'V split, but doesn't want to see - * the upper 5 bits of (14bit) major. - */ - if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) - return -EINVAL; - - if (unlikely(test_default_acl && test_default_acl(dvp))) { - if (!_ACL_ALLOC(default_acl)) { - return -ENOMEM; - } - if (!_ACL_GET_DEFAULT(dvp, default_acl)) { - _ACL_FREE(default_acl); - default_acl = NULL; - } - } - - if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current)) - mode &= ~current->fs->umask; - - vattr.va_mask = XFS_AT_TYPE|XFS_AT_MODE; - vattr.va_mode = mode; - - switch (mode & S_IFMT) { - case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: - vattr.va_rdev = sysv_encode_dev(rdev); - vattr.va_mask |= XFS_AT_RDEV; - /*FALLTHROUGH*/ - case S_IFREG: - error = bhv_vop_create(dvp, dentry, &vattr, &vp, NULL); - break; - case S_IFDIR: - error = bhv_vop_mkdir(dvp, dentry, &vattr, &vp, NULL); - break; - default: - error = EINVAL; - break; - } - - if (unlikely(!error)) { - error = xfs_init_security(vp, dir); - if (error) - xfs_cleanup_inode(dvp, vp, dentry, mode); - } - - if (unlikely(default_acl)) { - if (!error) { - error = _ACL_INHERIT(vp, &vattr, default_acl); - if (!error) - VMODIFY(vp); - else - xfs_cleanup_inode(dvp, vp, dentry, mode); - } - _ACL_FREE(default_acl); - } - - if (likely(!error)) { - ASSERT(vp); - ip = vn_to_inode(vp); - - if (S_ISCHR(mode) || S_ISBLK(mode)) - ip->i_rdev = rdev; - else if (S_ISDIR(mode)) - xfs_validate_fields(ip, &vattr); - d_instantiate(dentry, ip); - xfs_validate_fields(dir, &vattr); - } - return -error; -} - -STATIC int -xfs_vn_create( - struct inode *dir, - struct dentry *dentry, - int mode, - struct nameidata *nd) -{ - return xfs_vn_mknod(dir, dentry, mode, 0); -} - -STATIC int -xfs_vn_mkdir( - struct inode *dir, - struct dentry *dentry, - int mode) -{ - return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); -} - -STATIC struct dentry * -xfs_vn_lookup( - struct inode *dir, - struct dentry *dentry, - struct nameidata *nd) -{ - bhv_vnode_t *vp = vn_from_inode(dir), *cvp; - int error; - - if (dentry->d_name.len >= MAXNAMELEN) - return ERR_PTR(-ENAMETOOLONG); - - error = bhv_vop_lookup(vp, dentry, &cvp, 0, NULL, NULL); - if (unlikely(error)) { - if (unlikely(error != ENOENT)) - return ERR_PTR(-error); - d_add(dentry, NULL); - return NULL; - } - - return d_splice_alias(vn_to_inode(cvp), dentry); -} - -STATIC int -xfs_vn_link( - struct dentry *old_dentry, - struct inode *dir, - struct dentry *dentry) -{ - struct inode *ip; /* inode of guy being linked to */ - bhv_vnode_t *tdvp; /* target directory for new name/link */ - bhv_vnode_t *vp; /* vp of name being linked */ - bhv_vattr_t vattr; - int error; - - ip = old_dentry->d_inode; /* inode being linked to */ - tdvp = vn_from_inode(dir); - vp = vn_from_inode(ip); - - VN_HOLD(vp); - error = bhv_vop_link(tdvp, vp, dentry, NULL); - if (unlikely(error)) { - VN_RELE(vp); - } else { - VMODIFY(tdvp); - xfs_validate_fields(ip, &vattr); - d_instantiate(dentry, ip); - } - return -error; -} - -STATIC int -xfs_vn_unlink( - struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode; - bhv_vnode_t *dvp; /* directory containing name to remove */ - bhv_vattr_t vattr; - int error; - - inode = dentry->d_inode; - dvp = vn_from_inode(dir); - - error = bhv_vop_remove(dvp, dentry, NULL); - if (likely(!error)) { - xfs_validate_fields(dir, &vattr); /* size needs update */ - xfs_validate_fields(inode, &vattr); - } - return -error; -} - -STATIC int -xfs_vn_symlink( - struct inode *dir, - struct dentry *dentry, - const char *symname) -{ - struct inode *ip; - bhv_vattr_t va = { 0 }; - bhv_vnode_t *dvp; /* directory containing name of symlink */ - bhv_vnode_t *cvp; /* used to lookup symlink to put in dentry */ - int error; - - dvp = vn_from_inode(dir); - cvp = NULL; - - va.va_mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); - va.va_mask = XFS_AT_TYPE|XFS_AT_MODE; - - error = bhv_vop_symlink(dvp, dentry, &va, (char *)symname, &cvp, NULL); - if (likely(!error && cvp)) { - error = xfs_init_security(cvp, dir); - if (likely(!error)) { - ip = vn_to_inode(cvp); - d_instantiate(dentry, ip); - xfs_validate_fields(dir, &va); - xfs_validate_fields(ip, &va); - } else { - xfs_cleanup_inode(dvp, cvp, dentry, 0); - } - } - return -error; -} - -STATIC int -xfs_vn_rmdir( - struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - bhv_vnode_t *dvp = vn_from_inode(dir); - bhv_vattr_t vattr; - int error; - - error = bhv_vop_rmdir(dvp, dentry, NULL); - if (likely(!error)) { - xfs_validate_fields(inode, &vattr); - xfs_validate_fields(dir, &vattr); - } - return -error; -} - -STATIC int -xfs_vn_rename( - struct inode *odir, - struct dentry *odentry, - struct inode *ndir, - struct dentry *ndentry) -{ - struct inode *new_inode = ndentry->d_inode; - bhv_vnode_t *fvp; /* from directory */ - bhv_vnode_t *tvp; /* target directory */ - bhv_vattr_t vattr; - int error; - - fvp = vn_from_inode(odir); - tvp = vn_from_inode(ndir); - - error = bhv_vop_rename(fvp, odentry, tvp, ndentry, NULL); - if (likely(!error)) { - if (new_inode) - xfs_validate_fields(new_inode, &vattr); - xfs_validate_fields(odir, &vattr); - if (ndir != odir) - xfs_validate_fields(ndir, &vattr); - } - return -error; -} - -/* - * careful here - this function can get called recursively, so - * we need to be very careful about how much stack we use. - * uio is kmalloced for this reason... - */ -STATIC void * -xfs_vn_follow_link( - struct dentry *dentry, - struct nameidata *nd) -{ - bhv_vnode_t *vp; - uio_t *uio; - iovec_t iov; - int error; - char *link; - - ASSERT(dentry); - ASSERT(nd); - - link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); - if (!link) { - nd_set_link(nd, ERR_PTR(-ENOMEM)); - return NULL; - } - - uio = kmalloc(sizeof(uio_t), GFP_KERNEL); - if (!uio) { - kfree(link); - nd_set_link(nd, ERR_PTR(-ENOMEM)); - return NULL; - } - - vp = vn_from_inode(dentry->d_inode); - - iov.iov_base = link; - iov.iov_len = MAXPATHLEN; - - uio->uio_iov = &iov; - uio->uio_offset = 0; - uio->uio_segflg = UIO_SYSSPACE; - uio->uio_resid = MAXPATHLEN; - uio->uio_iovcnt = 1; - - error = bhv_vop_readlink(vp, uio, 0, NULL); - if (unlikely(error)) { - kfree(link); - link = ERR_PTR(-error); - } else { - link[MAXPATHLEN - uio->uio_resid] = '\0'; - } - kfree(uio); - - nd_set_link(nd, link); - return NULL; -} - -STATIC void -xfs_vn_put_link( - struct dentry *dentry, - struct nameidata *nd, - void *p) -{ - char *s = nd_get_link(nd); - - if (!IS_ERR(s)) - kfree(s); -} - -#ifdef CONFIG_XFS_POSIX_ACL -STATIC int -xfs_vn_permission( - struct inode *inode, - int mode, - struct nameidata *nd) -{ - return -bhv_vop_access(vn_from_inode(inode), mode << 6, NULL); -} -#else -#define xfs_vn_permission NULL -#endif - -STATIC int -xfs_vn_getattr( - struct vfsmount *mnt, - struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - int error = 0; - - if (unlikely(vp->v_flag & VMODIFIED)) - error = vn_revalidate(vp); - if (!error) - generic_fillattr(inode, stat); - return -error; -} - -STATIC int -xfs_vn_setattr( - struct dentry *dentry, - struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - unsigned int ia_valid = attr->ia_valid; - bhv_vnode_t *vp = vn_from_inode(inode); - bhv_vattr_t vattr = { 0 }; - int flags = 0; - int error; - - if (ia_valid & ATTR_UID) { - vattr.va_mask |= XFS_AT_UID; - vattr.va_uid = attr->ia_uid; - } - if (ia_valid & ATTR_GID) { - vattr.va_mask |= XFS_AT_GID; - vattr.va_gid = attr->ia_gid; - } - if (ia_valid & ATTR_SIZE) { - vattr.va_mask |= XFS_AT_SIZE; - vattr.va_size = attr->ia_size; - } - if (ia_valid & ATTR_ATIME) { - vattr.va_mask |= XFS_AT_ATIME; - vattr.va_atime = attr->ia_atime; - inode->i_atime = attr->ia_atime; - } - if (ia_valid & ATTR_MTIME) { - vattr.va_mask |= XFS_AT_MTIME; - vattr.va_mtime = attr->ia_mtime; - } - if (ia_valid & ATTR_CTIME) { - vattr.va_mask |= XFS_AT_CTIME; - vattr.va_ctime = attr->ia_ctime; - } - if (ia_valid & ATTR_MODE) { - vattr.va_mask |= XFS_AT_MODE; - vattr.va_mode = attr->ia_mode; - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - inode->i_mode &= ~S_ISGID; - } - - if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) - flags |= ATTR_UTIME; -#ifdef ATTR_NO_BLOCK - if ((ia_valid & ATTR_NO_BLOCK)) - flags |= ATTR_NONBLOCK; -#endif - - error = bhv_vop_setattr(vp, &vattr, flags, NULL); - if (likely(!error)) - __vn_revalidate(vp, &vattr); - return -error; -} - -STATIC void -xfs_vn_truncate( - struct inode *inode) -{ - block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks); -} - -STATIC int -xfs_vn_setxattr( - struct dentry *dentry, - const char *name, - const void *data, - size_t size, - int flags) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - int error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (flags & XATTR_CREATE) - xflags |= ATTR_CREATE; - if (flags & XATTR_REPLACE) - xflags |= ATTR_REPLACE; - xflags |= namesp->attr_flag; - return namesp->attr_set(vp, attr, (void *)data, size, xflags); -} - -STATIC ssize_t -xfs_vn_getxattr( - struct dentry *dentry, - const char *name, - void *data, - size_t size) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - ssize_t error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - - /* Convert Linux syscall to XFS internal ATTR flags */ - if (!size) { - xflags |= ATTR_KERNOVAL; - data = NULL; - } - xflags |= namesp->attr_flag; - return namesp->attr_get(vp, attr, (void *)data, size, xflags); -} - -STATIC ssize_t -xfs_vn_listxattr( - struct dentry *dentry, - char *data, - size_t size) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - int error, xflags = ATTR_KERNAMELS; - ssize_t result; - - if (!size) - xflags |= ATTR_KERNOVAL; - xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS; - - error = attr_generic_list(vp, data, size, xflags, &result); - if (error < 0) - return error; - return result; -} - -STATIC int -xfs_vn_removexattr( - struct dentry *dentry, - const char *name) -{ - bhv_vnode_t *vp = vn_from_inode(dentry->d_inode); - char *attr = (char *)name; - attrnames_t *namesp; - int xflags = 0; - int error; - - namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT); - if (!namesp) - return -EOPNOTSUPP; - attr += namesp->attr_namelen; - error = namesp->attr_capable(vp, NULL); - if (error) - return error; - xflags |= namesp->attr_flag; - return namesp->attr_remove(vp, attr, xflags); -} - - -struct inode_operations xfs_inode_operations = { - .permission = xfs_vn_permission, - .truncate = xfs_vn_truncate, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, - .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, -}; - -struct inode_operations xfs_dir_inode_operations = { - .create = xfs_vn_create, - .lookup = xfs_vn_lookup, - .link = xfs_vn_link, - .unlink = xfs_vn_unlink, - .symlink = xfs_vn_symlink, - .mkdir = xfs_vn_mkdir, - .rmdir = xfs_vn_rmdir, - .mknod = xfs_vn_mknod, - .rename = xfs_vn_rename, - .permission = xfs_vn_permission, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, - .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, -}; - -struct inode_operations xfs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = xfs_vn_follow_link, - .put_link = xfs_vn_put_link, - .permission = xfs_vn_permission, - .getattr = xfs_vn_getattr, - .setattr = xfs_vn_setattr, - .setxattr = xfs_vn_setxattr, - .getxattr = xfs_vn_getxattr, - .listxattr = xfs_vn_listxattr, - .removexattr = xfs_vn_removexattr, -}; diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c deleted file mode 100644 index ee788b1cb36..00000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_inode_item.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_iomap.h" - -#include <linux/capability.h> -#include <linux/writeback.h> - - -#if defined(XFS_RW_TRACE) -void -xfs_rw_enter_trace( - int tag, - xfs_iocore_t *io, - void *data, - size_t segs, - loff_t offset, - int ioflags) -{ - xfs_inode_t *ip = XFS_IO_INODE(io); - - if (ip->i_rwtrace == NULL) - return; - ktrace_enter(ip->i_rwtrace, - (void *)(unsigned long)tag, - (void *)ip, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)data, - (void *)((unsigned long)segs), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)ioflags), - (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(io->io_new_size & 0xffffffff)), - (void *)((unsigned long)current_pid()), - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL); -} - -void -xfs_inval_cached_trace( - xfs_iocore_t *io, - xfs_off_t offset, - xfs_off_t len, - xfs_off_t first, - xfs_off_t last) -{ - xfs_inode_t *ip = XFS_IO_INODE(io); - - if (ip->i_rwtrace == NULL) - return; - ktrace_enter(ip->i_rwtrace, - (void *)(__psint_t)XFS_INVAL_CACHED, - (void *)ip, - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)((len >> 32) & 0xffffffff)), - (void *)((unsigned long)(len & 0xffffffff)), - (void *)((unsigned long)((first >> 32) & 0xffffffff)), - (void *)((unsigned long)(first & 0xffffffff)), - (void *)((unsigned long)((last >> 32) & 0xffffffff)), - (void *)((unsigned long)(last & 0xffffffff)), - (void *)((unsigned long)current_pid()), - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL); -} -#endif - -/* - * xfs_iozero - * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. - */ -STATIC int -xfs_iozero( - struct inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count, /* size of data to zero */ - loff_t end_size) /* max file size to set */ -{ - unsigned bytes; - struct page *page; - struct address_space *mapping; - char *kaddr; - int status; - - mapping = ip->i_mapping; - do { - unsigned long index, offset; - - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - if (bytes > count) - bytes = count; - - status = -ENOMEM; - page = grab_cache_page(mapping, index); - if (!page) - break; - - kaddr = kmap(page); - status = mapping->a_ops->prepare_write(NULL, page, offset, - offset + bytes); - if (status) { - goto unlock; - } - - memset((void *) (kaddr + offset), 0, bytes); - flush_dcache_page(page); - status = mapping->a_ops->commit_write(NULL, page, offset, - offset + bytes); - if (!status) { - pos += bytes; - count -= bytes; - if (pos > i_size_read(ip)) - i_size_write(ip, pos < end_size ? pos : end_size); - } - -unlock: - kunmap(page); - unlock_page(page); - page_cache_release(page); - if (status) - break; - } while (count); - - return (-status); -} - -ssize_t /* bytes read, or (-) error */ -xfs_read( - bhv_desc_t *bdp, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int segs, - loff_t *offset, - int ioflags, - cred_t *credp) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - size_t size = 0; - ssize_t ret; - xfs_fsize_t n; - xfs_inode_t *ip; - xfs_mount_t *mp; - bhv_vnode_t *vp; - unsigned long seg; - - ip = XFS_BHVTOI(bdp); - vp = BHV_TO_VNODE(bdp); - mp = ip->i_mount; - - XFS_STATS_INC(xs_read_calls); - - /* START copy & waste from filemap.c */ - for (seg = 0; seg < segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - size += iv->iov_len; - if (unlikely((ssize_t)(size|iv->iov_len) < 0)) - return XFS_ERROR(-EINVAL); - } - /* END copy & waste from filemap.c */ - - if (unlikely(ioflags & IO_ISDIRECT)) { - xfs_buftarg_t *target = - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - if ((*offset & target->bt_smask) || - (size & target->bt_smask)) { - if (*offset == ip->i_d.di_size) { - return (0); - } - return -XFS_ERROR(EINVAL); - } - } - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (size == 0)) - return 0; - - if (n < size) - size = n; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_lock(&inode->i_mutex); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && - !(ioflags & IO_INVIS)) { - bhv_vrwlock_t locktype = VRWLOCK_READ; - int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); - - ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, - BHV_TO_VNODE(bdp), *offset, size, - dmflags, &locktype); - if (ret) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - return ret; - } - } - - if (unlikely((ioflags & IO_ISDIRECT) && VN_CACHED(vp))) - bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)), - -1, FI_REMAPF_LOCKED); - - if (unlikely(ioflags & IO_ISDIRECT)) - mutex_unlock(&inode->i_mutex); - - xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore, - (void *)iovp, segs, *offset, ioflags); - ret = __generic_file_aio_read(iocb, iovp, segs, offset); - if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) - ret = wait_on_sync_kiocb(iocb); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_sendfile( - bhv_desc_t *bdp, - struct file *filp, - loff_t *offset, - int ioflags, - size_t count, - read_actor_t actor, - void *target, - cred_t *credp) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_READ; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - *offset, count, - FILP_DELAY_FLAG(filp), &locktype); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, - (void *)(unsigned long)target, count, *offset, ioflags); - ret = generic_file_sendfile(filp, offset, count, actor, target); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_read( - bhv_desc_t *bdp, - struct file *infilp, - loff_t *ppos, - struct pipe_inode_info *pipe, - size_t count, - int flags, - int ioflags, - cred_t *credp) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - - XFS_STATS_INC(xs_read_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_READ; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - *ppos, count, - FILP_DELAY_FLAG(infilp), &locktype); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return -error; - } - } - xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, - pipe, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); - - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - return ret; -} - -ssize_t -xfs_splice_write( - bhv_desc_t *bdp, - struct pipe_inode_info *pipe, - struct file *outfilp, - loff_t *ppos, - size_t count, - int flags, - int ioflags, - cred_t *credp) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_mount_t *mp = ip->i_mount; - ssize_t ret; - struct inode *inode = outfilp->f_mapping->host; - xfs_fsize_t isize; - - XFS_STATS_INC(xs_write_calls); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) && - (!(ioflags & IO_INVIS))) { - bhv_vrwlock_t locktype = VRWLOCK_WRITE; - int error; - - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - *ppos, count, - FILP_DELAY_FLAG(outfilp), &locktype); - if (error) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return -error; - } - } - xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, - pipe, count, *ppos, ioflags); - ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); - if (ret > 0) - XFS_STATS_ADD(xs_write_bytes, ret); - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) - *ppos = isize; - - if (*ppos > ip->i_d.di_size) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (*ppos > ip->i_d.di_size) { - ip->i_d.di_size = *ppos; - i_size_write(inode, *ppos); - ip->i_update_core = 1; - ip->i_update_size = 1; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return ret; -} - -/* - * This routine is called to handle zeroing any space in the last - * block of the file that is beyond the EOF. We do this since the - * size is being increased without writing anything to that block - * and we don't want anyone to read the garbage on the disk. - */ -STATIC int /* error (positive) */ -xfs_zero_last_block( - struct inode *ip, - xfs_iocore_t *io, - xfs_fsize_t isize, - xfs_fsize_t end_size) -{ - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = io->io_mount; - int nimaps; - int zero_offset; - int zero_len; - int error = 0; - xfs_bmbt_irec_t imap; - loff_t loff; - - ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); - - zero_offset = XFS_B_FSB_OFFSET(mp, isize); - if (zero_offset == 0) { - /* - * There are no extra bytes in the last block on disk to - * zero, so return. - */ - return 0; - } - - last_fsb = XFS_B_TO_FSBT(mp, isize); - nimaps = 1; - error = XFS_BMAPI(mp, NULL, io, last_fsb, 1, 0, NULL, 0, &imap, - &nimaps, NULL, NULL); - if (error) { - return error; - } - ASSERT(nimaps > 0); - /* - * If the block underlying isize is just a hole, then there - * is nothing to zero. - */ - if (imap.br_startblock == HOLESTARTBLOCK) { - return 0; - } - /* - * Zero the part of the last block beyond the EOF, and write it - * out sync. We need to drop the ilock while we do this so we - * don't deadlock when the buffer cache calls back to us. - */ - XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); - - loff = XFS_FSB_TO_B(mp, last_fsb); - zero_len = mp->m_sb.sb_blocksize - zero_offset; - error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); - - XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - ASSERT(error >= 0); - return error; -} - -/* - * Zero any on disk space between the current EOF and the new, - * larger EOF. This handles the normal case of zeroing the remainder - * of the last block in the file and the unusual case of zeroing blocks - * out beyond the size of the file. This second case only happens - * with fixed size extents and when the system crashes before the inode - * size was updated but after blocks were allocated. If fill is set, - * then any holes in the range are filled and zeroed. If not, the holes - * are left alone as holes. - */ - -int /* error (positive) */ -xfs_zero_eof( - bhv_vnode_t *vp, - xfs_iocore_t *io, - xfs_off_t offset, /* starting I/O offset */ - xfs_fsize_t isize, /* current inode size */ - xfs_fsize_t end_size) /* terminal inode size */ -{ - struct inode *ip = vn_to_inode(vp); - xfs_fileoff_t start_zero_fsb; - xfs_fileoff_t end_zero_fsb; - xfs_fileoff_t zero_count_fsb; - xfs_fileoff_t last_fsb; - xfs_mount_t *mp = io->io_mount; - int nimaps; - int error = 0; - xfs_bmbt_irec_t imap; - - ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); - ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); - ASSERT(offset > isize); - - /* - * First handle zeroing the block on which isize resides. - * We only zero a part of that block so it is handled specially. - */ - error = xfs_zero_last_block(ip, io, isize, end_size); - if (error) { - ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); - ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); - return error; - } - - /* - * Calculate the range between the new size and the old - * where blocks needing to be zeroed may exist. To get the - * block where the last byte in the file currently resides, - * we need to subtract one from the size and truncate back - * to a block boundary. We subtract 1 in case the size is - * exactly on a block boundary. - */ - last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; - start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); - end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); - ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); - if (last_fsb == end_zero_fsb) { - /* - * The size was only incremented on its last block. - * We took care of that above, so just return. - */ - return 0; - } - - ASSERT(start_zero_fsb <= end_zero_fsb); - while (start_zero_fsb <= end_zero_fsb) { - nimaps = 1; - zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; - error = XFS_BMAPI(mp, NULL, io, start_zero_fsb, zero_count_fsb, - 0, NULL, 0, &imap, &nimaps, NULL, NULL); - if (error) { - ASSERT(ismrlocked(io->io_lock, MR_UPDATE)); - ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); - return error; - } - ASSERT(nimaps > 0); - - if (imap.br_state == XFS_EXT_UNWRITTEN || - imap.br_startblock == HOLESTARTBLOCK) { - /* - * This loop handles initializing pages that were - * partially initialized by the code below this - * loop. It basically zeroes the part of the page - * that sits on a hole and sets the page as P_HOLE - * and calls remapf if it is a mapped file. - */ - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - continue; - } - - /* - * There are blocks we need to zero. - * Drop the inode lock while we're doing the I/O. - * We'll still have the iolock to protect us. - */ - XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - - error = xfs_iozero(ip, - XFS_FSB_TO_B(mp, start_zero_fsb), - XFS_FSB_TO_B(mp, imap.br_blockcount), - end_size); - if (error) { - goto out_lock; - } - - start_zero_fsb = imap.br_startoff + imap.br_blockcount; - ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - - XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - } - - return 0; - -out_lock: - - XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); - ASSERT(error >= 0); - return error; -} - -ssize_t /* bytes written, or (-) error */ -xfs_write( - bhv_desc_t *bdp, - struct kiocb *iocb, - const struct iovec *iovp, - unsigned int nsegs, - loff_t *offset, - int ioflags, - cred_t *credp) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - unsigned long segs = nsegs; - xfs_inode_t *xip; - xfs_mount_t *mp; - ssize_t ret = 0, error = 0; - xfs_fsize_t isize, new_size; - xfs_iocore_t *io; - bhv_vnode_t *vp; - unsigned long seg; - int iolock; - int eventsent = 0; - bhv_vrwlock_t locktype; - size_t ocount = 0, count; - loff_t pos; - int need_i_mutex = 1, need_flush = 0; - - XFS_STATS_INC(xs_write_calls); - - vp = BHV_TO_VNODE(bdp); - xip = XFS_BHVTOI(bdp); - - for (seg = 0; seg < segs; seg++) { - const struct iovec *iv = &iovp[seg]; - - /* - * If any segment has a negative length, or the cumulative - * length ever wraps negative then return -EINVAL. - */ - ocount += iv->iov_len; - if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) - return -EINVAL; - if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) - continue; - if (seg == 0) - return -EFAULT; - segs = seg; - ocount -= iv->iov_len; /* This segment is no good */ - break; - } - - count = ocount; - pos = *offset; - - if (count == 0) - return 0; - - io = &xip->i_iocore; - mp = io->io_mount; - - vfs_wait_for_freeze(vp->v_vfsp, SB_FREEZE_WRITE); - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - if (ioflags & IO_ISDIRECT) { - xfs_buftarg_t *target = - (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? - mp->m_rtdev_targp : mp->m_ddev_targp; - - if ((pos & target->bt_smask) || (count & target->bt_smask)) - return XFS_ERROR(-EINVAL); - - if (!VN_CACHED(vp) && pos < i_size_read(inode)) - need_i_mutex = 0; - - if (VN_CACHED(vp)) - need_flush = 1; - } - -relock: - if (need_i_mutex) { - iolock = XFS_IOLOCK_EXCL; - locktype = VRWLOCK_WRITE; - - mutex_lock(&inode->i_mutex); - } else { - iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; - } - - xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); - - isize = i_size_read(inode); - - if (file->f_flags & O_APPEND) - *offset = isize; - -start: - error = -generic_write_checks(file, &pos, &count, - S_ISBLK(inode->i_mode)); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; - } - - new_size = pos + count; - if (new_size > isize) - io->io_new_size = new_size; - - if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && - !(ioflags & IO_INVIS) && !eventsent)) { - loff_t savedsize = pos; - int dmflags = FILP_DELAY_FLAG(file); - - if (need_i_mutex) - dmflags |= DM_FLAGS_IMUX; - - xfs_iunlock(xip, XFS_ILOCK_EXCL); - error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, - pos, count, - dmflags, &locktype); - if (error) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; - } - xfs_ilock(xip, XFS_ILOCK_EXCL); - eventsent = 1; - - /* - * The iolock was dropped and reacquired in XFS_SEND_DATA - * so we have to recheck the size when appending. - * We will only "goto start;" once, since having sent the - * event prevents another call to XFS_SEND_DATA, which is - * what allows the size to change in the first place. - */ - if ((file->f_flags & O_APPEND) && savedsize != isize) { - pos = isize = xip->i_d.di_size; - goto start; - } - } - - if (likely(!(ioflags & IO_INVIS))) { - file_update_time(file); - xfs_ichgtime_fast(xip, inode, - XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - - /* - * If the offset is beyond the size of the file, we have a couple - * of things to do. First, if there is already space allocated - * we need to either create holes or zero the disk or ... - * - * If there is a page where the previous size lands, we need - * to zero it out up to the new size. - */ - - if (pos > isize) { - error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, pos, - isize, pos + count); - if (error) { - xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); - goto out_unlock_mutex; - } - } - xfs_iunlock(xip, XFS_ILOCK_EXCL); - - /* - * If we're writing the file then make sure to clear the - * setuid and setgid bits if the process is not being run - * by root. This keeps people from modifying setuid and - * setgid binaries. - */ - - if (((xip->i_d.di_mode & S_ISUID) || - ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == - (S_ISGID | S_IXGRP))) && - !capable(CAP_FSETID)) { - error = xfs_write_clear_setuid(xip); - if (likely(!error)) - error = -remove_suid(file->f_dentry); - if (unlikely(error)) { - xfs_iunlock(xip, iolock); - goto out_unlock_mutex; - } - } - -retry: - /* We can write back this queue in page reclaim */ - current->backing_dev_info = mapping->backing_dev_info; - - if ((ioflags & IO_ISDIRECT)) { - if (need_flush) { - xfs_inval_cached_trace(io, pos, -1, - ctooff(offtoct(pos)), -1); - bhv_vop_flushinval_pages(vp, ctooff(offtoct(pos)), - -1, FI_REMAPF_LOCKED); - } - - if (need_i_mutex) { - /* demote the lock now the cached pages are gone */ - XFS_ILOCK_DEMOTE(mp, io, XFS_IOLOCK_EXCL); - mutex_unlock(&inode->i_mutex); - - iolock = XFS_IOLOCK_SHARED; - locktype = VRWLOCK_WRITE_DIRECT; - need_i_mutex = 0; - } - - xfs_rw_enter_trace(XFS_DIOWR_ENTER, io, (void *)iovp, segs, - *offset, ioflags); - ret = generic_file_direct_write(iocb, iovp, - &segs, pos, offset, count, ocount); - - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. - */ - if (ret >= 0 && ret != count) { - XFS_STATS_ADD(xs_write_bytes, ret); - - pos += ret; - count -= ret; - - need_i_mutex = 1; - ioflags &= ~IO_ISDIRECT; - xfs_iunlock(xip, iolock); - goto relock; - } - } else { - xfs_rw_enter_trace(XFS_WRITE_ENTER, io, (void *)iovp, segs, - *offset, ioflags); - ret = generic_file_buffered_write(iocb, iovp, segs, - pos, offset, count, ret); - } - - current->backing_dev_info = NULL; - - if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) - ret = wait_on_sync_kiocb(iocb); - - if ((ret == -ENOSPC) && - DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && - !(ioflags & IO_INVIS)) { - - xfs_rwunlock(bdp, locktype); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, - DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, - 0, 0, 0); /* Delay flag intentionally unused */ - if (error) - goto out_nounlocks; - if (need_i_mutex) - mutex_lock(&inode->i_mutex); - xfs_rwlock(bdp, locktype); - pos = xip->i_d.di_size; - ret = 0; - goto retry; - } - - isize = i_size_read(inode); - if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) - *offset = isize; - - if (*offset > xip->i_d.di_size) { - xfs_ilock(xip, XFS_ILOCK_EXCL); - if (*offset > xip->i_d.di_size) { - xip->i_d.di_size = *offset; - i_size_write(inode, *offset); - xip->i_update_core = 1; - xip->i_update_size = 1; - } - xfs_iunlock(xip, XFS_ILOCK_EXCL); - } - - error = -ret; - if (ret <= 0) - goto out_unlock_internal; - - XFS_STATS_ADD(xs_write_bytes, ret); - - /* Handle various SYNC-type writes */ - if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { - error = xfs_write_sync_logforce(mp, xip); - if (error) - goto out_unlock_internal; - - xfs_rwunlock(bdp, locktype); - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - - error = sync_page_range(inode, mapping, pos, ret); - if (!error) - error = ret; - return error; - } - - out_unlock_internal: - xfs_rwunlock(bdp, locktype); - out_unlock_mutex: - if (need_i_mutex) - mutex_unlock(&inode->i_mutex); - out_nounlocks: - return -error; -} - -/* - * All xfs metadata buffers except log state machine buffers - * get this attached as their b_bdstrat callback function. - * This is so that we can catch a buffer - * after prematurely unpinning it to forcibly shutdown the filesystem. - */ -int -xfs_bdstrat_cb(struct xfs_buf *bp) -{ - xfs_mount_t *mp; - - mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_buf_iorequest(bp); - return 0; - } else { - xfs_buftrace("XFS__BDSTRAT IOERROR", bp); - /* - * Metadata write that didn't get logged but - * written delayed anyway. These aren't associated - * with a transaction, and can be ignored. - */ - if (XFS_BUF_IODONE_FUNC(bp) == NULL && - (XFS_BUF_ISREAD(bp)) == 0) - return (xfs_bioerror_relse(bp)); - else - return (xfs_bioerror(bp)); - } -} - - -int -xfs_bmap(bhv_desc_t *bdp, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) -{ - xfs_inode_t *ip = XFS_BHVTOI(bdp); - xfs_iocore_t *io = &ip->i_iocore; - - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); - ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == - ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); - - return xfs_iomap(io, offset, count, flags, iomapp, niomaps); -} - -/* - * Wrapper around bdstrat so that we can stop data - * from going to disk in case we are shutting down the filesystem. - * Typically user data goes thru this path; one of the exceptions - * is the superblock. - */ -int -xfsbdstrat( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - ASSERT(mp); - if (!XFS_FORCED_SHUTDOWN(mp)) { - /* Grio redirection would go here - * if (XFS_BUF_IS_GRIO(bp)) { - */ - - xfs_buf_iorequest(bp); - return 0; - } - - xfs_buftrace("XFSBDSTRAT IOERROR", bp); - return (xfs_bioerror_relse(bp)); -} - -/* - * If the underlying (data/log/rt) device is readonly, there are some - * operations that cannot proceed. - */ -int -xfs_dev_is_read_only( - xfs_mount_t *mp, - char *message) -{ - if (xfs_readonly_buftarg(mp->m_ddev_targp) || - xfs_readonly_buftarg(mp->m_logdev_targp) || - (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { - cmn_err(CE_NOTE, - "XFS: %s required on read-only device.", message); - cmn_err(CE_NOTE, - "XFS: write access unavailable, cannot proceed."); - return EROFS; - } - return 0; -} diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h deleted file mode 100644 index c77e62efb74..00000000000 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_LRW_H__ -#define __XFS_LRW_H__ - -struct bhv_desc; -struct bhv_vnode; -struct xfs_mount; -struct xfs_iocore; -struct xfs_inode; -struct xfs_bmbt_irec; -struct xfs_buf; -struct xfs_iomap; - -#if defined(XFS_RW_TRACE) -/* - * Defines for the trace mechanisms in xfs_lrw.c. - */ -#define XFS_RW_KTRACE_SIZE 128 - -#define XFS_READ_ENTER 1 -#define XFS_WRITE_ENTER 2 -#define XFS_IOMAP_READ_ENTER 3 -#define XFS_IOMAP_WRITE_ENTER 4 -#define XFS_IOMAP_READ_MAP 5 -#define XFS_IOMAP_WRITE_MAP 6 -#define XFS_IOMAP_WRITE_NOSPACE 7 -#define XFS_ITRUNC_START 8 -#define XFS_ITRUNC_FINISH1 9 -#define XFS_ITRUNC_FINISH2 10 -#define XFS_CTRUNC1 11 -#define XFS_CTRUNC2 12 -#define XFS_CTRUNC3 13 -#define XFS_CTRUNC4 14 -#define XFS_CTRUNC5 15 -#define XFS_CTRUNC6 16 -#define XFS_BUNMAP 17 -#define XFS_INVAL_CACHED 18 -#define XFS_DIORD_ENTER 19 -#define XFS_DIOWR_ENTER 20 -#define XFS_SENDFILE_ENTER 21 -#define XFS_WRITEPAGE_ENTER 22 -#define XFS_RELEASEPAGE_ENTER 23 -#define XFS_INVALIDPAGE_ENTER 24 -#define XFS_IOMAP_ALLOC_ENTER 25 -#define XFS_IOMAP_ALLOC_MAP 26 -#define XFS_IOMAP_UNWRITTEN 27 -#define XFS_SPLICE_READ_ENTER 28 -#define XFS_SPLICE_WRITE_ENTER 29 -extern void xfs_rw_enter_trace(int, struct xfs_iocore *, - void *, size_t, loff_t, int); -extern void xfs_inval_cached_trace(struct xfs_iocore *, - xfs_off_t, xfs_off_t, xfs_off_t, xfs_off_t); -#else -#define xfs_rw_enter_trace(tag, io, data, size, offset, ioflags) -#define xfs_inval_cached_trace(io, offset, len, first, last) -#endif - -/* - * Maximum count of bmaps used by read and write paths. - */ -#define XFS_MAX_RW_NBMAPS 4 - -extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); -extern int xfs_bdstrat_cb(struct xfs_buf *); -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); - -extern int xfs_zero_eof(struct bhv_vnode *, struct xfs_iocore *, xfs_off_t, - xfs_fsize_t, xfs_fsize_t); -extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, - loff_t *, int, size_t, read_actor_t, - void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, - struct pipe_inode_info *, size_t, int, int, - struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, - struct file *, loff_t *, size_t, int, int, - struct cred *); - -#endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c deleted file mode 100644 index e480b610205..00000000000 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include <linux/proc_fs.h> - -DEFINE_PER_CPU(struct xfsstats, xfsstats); - -STATIC int -xfs_read_xfsstats( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) -{ - int c, i, j, len, val; - __uint64_t xs_xstrat_bytes = 0; - __uint64_t xs_write_bytes = 0; - __uint64_t xs_read_bytes = 0; - - static const struct xstats_entry { - char *desc; - int endpoint; - } xstats[] = { - { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, - { "abt", XFSSTAT_END_ALLOC_BTREE }, - { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, - { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, - { "dir", XFSSTAT_END_DIRECTORY_OPS }, - { "trans", XFSSTAT_END_TRANSACTIONS }, - { "ig", XFSSTAT_END_INODE_OPS }, - { "log", XFSSTAT_END_LOG_OPS }, - { "push_ail", XFSSTAT_END_TAIL_PUSHING }, - { "xstrat", XFSSTAT_END_WRITE_CONVERT }, - { "rw", XFSSTAT_END_READ_WRITE_OPS }, - { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, - { "icluster", XFSSTAT_END_INODE_CLUSTER }, - { "vnodes", XFSSTAT_END_VNODE_OPS }, - { "buf", XFSSTAT_END_BUF }, - }; - - /* Loop over all stats groups */ - for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { - len += sprintf(buffer + len, xstats[i].desc); - /* inner loop does each group */ - while (j < xstats[i].endpoint) { - val = 0; - /* sum over all cpus */ - for_each_possible_cpu(c) - val += *(((__u32*)&per_cpu(xfsstats, c) + j)); - len += sprintf(buffer + len, " %u", val); - j++; - } - buffer[len++] = '\n'; - } - /* extra precision counters */ - for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; - xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; - xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; - } - - len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", - xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - len += sprintf(buffer + len, "debug %u\n", -#if defined(DEBUG) - 1); -#else - 0); -#endif - - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; -} - -void -xfs_init_procfs(void) -{ - if (!proc_mkdir("fs/xfs", NULL)) - return; - create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL); -} - -void -xfs_cleanup_procfs(void) -{ - remove_proc_entry("fs/xfs/stat", NULL); - remove_proc_entry("fs/xfs", NULL); -} diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c deleted file mode 100644 index 9df9ed37d21..00000000000 --- a/fs/xfs/linux-2.6/xfs_super.c +++ /dev/null @@ -1,965 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_clnt.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_version.h" - -#include <linux/namei.h> -#include <linux/init.h> -#include <linux/mount.h> -#include <linux/mempool.h> -#include <linux/writeback.h> -#include <linux/kthread.h> - -STATIC struct quotactl_ops xfs_quotactl_operations; -STATIC struct super_operations xfs_super_operations; -STATIC kmem_zone_t *xfs_vnode_zone; -STATIC kmem_zone_t *xfs_ioend_zone; -mempool_t *xfs_ioend_pool; - -STATIC struct xfs_mount_args * -xfs_args_allocate( - struct super_block *sb, - int silent) -{ - struct xfs_mount_args *args; - - args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP); - args->logbufs = args->logbufsize = -1; - strncpy(args->fsname, sb->s_id, MAXNAMELEN); - - /* Copy the already-parsed mount(2) flags we're interested in */ - if (sb->s_flags & MS_DIRSYNC) - args->flags |= XFSMNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) - args->flags |= XFSMNT_WSYNC; - if (silent) - args->flags |= XFSMNT_QUIET; - args->flags |= XFSMNT_32BITINODES; - - return args; -} - -__uint64_t -xfs_max_file_offset( - unsigned int blockshift) -{ - unsigned int pagefactor = 1; - unsigned int bitshift = BITS_PER_LONG - 1; - - /* Figure out maximum filesize, on Linux this can depend on - * the filesystem blocksize (on 32 bit platforms). - * __block_prepare_write does this in an [unsigned] long... - * page->index << (PAGE_CACHE_SHIFT - bbits) - * So, for page sized blocks (4K on 32 bit platforms), - * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is - * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) - * but for smaller blocksizes it is less (bbits = log2 bsize). - * Note1: get_block_t takes a long (implicit cast from above) - * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch - * can optionally convert the [unsigned] long from above into - * an [unsigned] long long. - */ - -#if BITS_PER_LONG == 32 -# if defined(CONFIG_LBD) - ASSERT(sizeof(sector_t) == 8); - pagefactor = PAGE_CACHE_SIZE; - bitshift = BITS_PER_LONG; -# else - pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); -# endif -#endif - - return (((__uint64_t)pagefactor) << bitshift) - 1; -} - -STATIC __inline__ void -xfs_set_inodeops( - struct inode *inode) -{ - switch (inode->i_mode & S_IFMT) { - case S_IFREG: - inode->i_op = &xfs_inode_operations; - inode->i_fop = &xfs_file_operations; - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - case S_IFDIR: - inode->i_op = &xfs_dir_inode_operations; - inode->i_fop = &xfs_dir_file_operations; - break; - case S_IFLNK: - inode->i_op = &xfs_symlink_inode_operations; - if (inode->i_blocks) - inode->i_mapping->a_ops = &xfs_address_space_operations; - break; - default: - inode->i_op = &xfs_inode_operations; - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; - } -} - -STATIC __inline__ void -xfs_revalidate_inode( - xfs_mount_t *mp, - bhv_vnode_t *vp, - xfs_inode_t *ip) -{ - struct inode *inode = vn_to_inode(vp); - - inode->i_mode = ip->i_d.di_mode; - inode->i_nlink = ip->i_d.di_nlink; - inode->i_uid = ip->i_d.di_uid; - inode->i_gid = ip->i_d.di_gid; - - switch (inode->i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - inode->i_rdev = - MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, - sysv_minor(ip->i_df.if_u2.if_rdev)); - break; - default: - inode->i_rdev = 0; - break; - } - - inode->i_generation = ip->i_d.di_gen; - i_size_write(inode, ip->i_d.di_size); - inode->i_blocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; - inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; - inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; - vp->v_flag &= ~VMODIFIED; -} - -void -xfs_initialize_vnode( - bhv_desc_t *bdp, - bhv_vnode_t *vp, - bhv_desc_t *inode_bhv, - int unlock) -{ - xfs_inode_t *ip = XFS_BHVTOI(inode_bhv); - struct inode *inode = vn_to_inode(vp); - - if (!inode_bhv->bd_vobj) { - vp->v_vfsp = bhvtovfs(bdp); - bhv_desc_init(inode_bhv, ip, vp, &xfs_vnodeops); - bhv_insert(VN_BHV_HEAD(vp), inode_bhv); - } - - /* - * We need to set the ops vectors, and unlock the inode, but if - * we have been called during the new inode create process, it is - * too early to fill in the Linux inode. We will get called a - * second time once the inode is properly set up, and then we can - * finish our work. - */ - if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) { - xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip); - xfs_set_inodeops(inode); - - ip->i_flags &= ~XFS_INEW; - barrier(); - - unlock_new_inode(inode); - } -} - -int -xfs_blkdev_get( - xfs_mount_t *mp, - const char *name, - struct block_device **bdevp) -{ - int error = 0; - - *bdevp = open_bdev_excl(name, 0, mp); - if (IS_ERR(*bdevp)) { - error = PTR_ERR(*bdevp); - printk("XFS: Invalid device [%s], error=%d\n", name, error); - } - - return -error; -} - -void -xfs_blkdev_put( - struct block_device *bdev) -{ - if (bdev) - close_bdev_excl(bdev); -} - -/* - * Try to write out the superblock using barriers. - */ -STATIC int -xfs_barrier_test( - xfs_mount_t *mp) -{ - xfs_buf_t *sbp = xfs_getsb(mp, 0); - int error; - - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - XFS_BUF_ORDERED(sbp); - - xfsbdstrat(mp, sbp); - error = xfs_iowait(sbp); - - /* - * Clear all the flags we set and possible error state in the - * buffer. We only did the write to try out whether barriers - * worked and shouldn't leave any traces in the superblock - * buffer. - */ - XFS_BUF_DONE(sbp); - XFS_BUF_ERROR(sbp, 0); - XFS_BUF_UNORDERED(sbp); - - xfs_buf_relse(sbp); - return error; -} - -void -xfs_mountfs_check_barriers(xfs_mount_t *mp) -{ - int error; - - if (mp->m_logdev_targp != mp->m_ddev_targp) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, not supported with external log device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered == - QUEUE_ORDERED_NONE) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, not supported by the underlying device"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - if (xfs_readonly_buftarg(mp->m_ddev_targp)) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, underlying device is readonly"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } - - error = xfs_barrier_test(mp); - if (error) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, trial barrier write failed"); - mp->m_flags &= ~XFS_MOUNT_BARRIER; - return; - } -} - -void -xfs_blkdev_issue_flush( - xfs_buftarg_t *buftarg) -{ - blkdev_issue_flush(buftarg->bt_bdev, NULL); -} - -STATIC struct inode * -xfs_fs_alloc_inode( - struct super_block *sb) -{ - bhv_vnode_t *vp; - - vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP); - if (unlikely(!vp)) - return NULL; - return vn_to_inode(vp); -} - -STATIC void -xfs_fs_destroy_inode( - struct inode *inode) -{ - kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode)); -} - -STATIC void -xfs_fs_inode_init_once( - void *vnode, - kmem_zone_t *zonep, - unsigned long flags) -{ - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); -} - -STATIC int -xfs_init_zones(void) -{ - xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, - xfs_fs_inode_init_once); - if (!xfs_vnode_zone) - goto out; - - xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); - if (!xfs_ioend_zone) - goto out_destroy_vnode_zone; - - xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, - xfs_ioend_zone); - if (!xfs_ioend_pool) - goto out_free_ioend_zone; - return 0; - - out_free_ioend_zone: - kmem_zone_destroy(xfs_ioend_zone); - out_destroy_vnode_zone: - kmem_zone_destroy(xfs_vnode_zone); - out: - return -ENOMEM; -} - -STATIC void -xfs_destroy_zones(void) -{ - mempool_destroy(xfs_ioend_pool); - kmem_zone_destroy(xfs_vnode_zone); - kmem_zone_destroy(xfs_ioend_zone); -} - -/* - * Attempt to flush the inode, this will actually fail - * if the inode is pinned, but we dirty the inode again - * at the point when it is unpinned after a log write, - * since this is when the inode itself becomes flushable. - */ -STATIC int -xfs_fs_write_inode( - struct inode *inode, - int sync) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - int error = 0, flags = FLUSH_INODE; - - if (vp) { - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - if (sync) - flags |= FLUSH_SYNC; - error = bhv_vop_iflush(vp, flags); - if (error == EAGAIN) - error = sync? bhv_vop_iflush(vp, flags | FLUSH_LOG) : 0; - } - return -error; -} - -STATIC void -xfs_fs_clear_inode( - struct inode *inode) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - XFS_STATS_INC(vn_rele); - XFS_STATS_INC(vn_remove); - XFS_STATS_INC(vn_reclaim); - XFS_STATS_DEC(vn_active); - - /* - * This can happen because xfs_iget_core calls xfs_idestroy if we - * find an inode with di_mode == 0 but without IGET_CREATE set. - */ - if (VNHEAD(vp)) - bhv_vop_inactive(vp, NULL); - - VN_LOCK(vp); - vp->v_flag &= ~VMODIFIED; - VN_UNLOCK(vp, 0); - - if (VNHEAD(vp)) - if (bhv_vop_reclaim(vp)) - panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, vp); - - ASSERT(VNHEAD(vp) == NULL); - -#ifdef XFS_VNODE_TRACE - ktrace_free(vp->v_trace); -#endif -} - -/* - * Enqueue a work item to be picked up by the vfs xfssyncd thread. - * Doing this has two advantages: - * - It saves on stack space, which is tight in certain situations - * - It can be used (with care) as a mechanism to avoid deadlocks. - * Flushing while allocating in a full filesystem requires both. - */ -STATIC void -xfs_syncd_queue_work( - struct bhv_vfs *vfs, - void *data, - void (*syncer)(bhv_vfs_t *, void *)) -{ - struct bhv_vfs_sync_work *work; - - work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP); - INIT_LIST_HEAD(&work->w_list); - work->w_syncer = syncer; - work->w_data = data; - work->w_vfs = vfs; - spin_lock(&vfs->vfs_sync_lock); - list_add_tail(&work->w_list, &vfs->vfs_sync_list); - spin_unlock(&vfs->vfs_sync_lock); - wake_up_process(vfs->vfs_sync_task); -} - -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room... - */ -STATIC void -xfs_flush_inode_work( - bhv_vfs_t *vfs, - void *inode) -{ - filemap_flush(((struct inode *)inode)->i_mapping); - iput((struct inode *)inode); -} - -void -xfs_flush_inode( - xfs_inode_t *ip) -{ - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - struct bhv_vfs *vfs = XFS_MTOVFS(ip->i_mount); - - igrab(inode); - xfs_syncd_queue_work(vfs, inode, xfs_flush_inode_work); - delay(msecs_to_jiffies(500)); -} - -/* - * This is the "bigger hammer" version of xfs_flush_inode_work... - * (IOW, "If at first you don't succeed, use a Bigger Hammer"). - */ -STATIC void -xfs_flush_device_work( - bhv_vfs_t *vfs, - void *inode) -{ - sync_blockdev(vfs->vfs_super->s_bdev); - iput((struct inode *)inode); -} - -void -xfs_flush_device( - xfs_inode_t *ip) -{ - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - struct bhv_vfs *vfs = XFS_MTOVFS(ip->i_mount); - - igrab(inode); - xfs_syncd_queue_work(vfs, inode, xfs_flush_device_work); - delay(msecs_to_jiffies(500)); - xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC); -} - -STATIC void -vfs_sync_worker( - bhv_vfs_t *vfsp, - void *unused) -{ - int error; - - if (!(vfsp->vfs_flag & VFS_RDONLY)) - error = bhv_vfs_sync(vfsp, SYNC_FSDATA | SYNC_BDFLUSH | \ - SYNC_ATTR | SYNC_REFCACHE, NULL); - vfsp->vfs_sync_seq++; - wmb(); - wake_up(&vfsp->vfs_wait_single_sync_task); -} - -STATIC int -xfssyncd( - void *arg) -{ - long timeleft; - bhv_vfs_t *vfsp = (bhv_vfs_t *) arg; - bhv_vfs_sync_work_t *work, *n; - LIST_HEAD (tmp); - - timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); - for (;;) { - timeleft = schedule_timeout_interruptible(timeleft); - /* swsusp */ - try_to_freeze(); - if (kthread_should_stop() && list_empty(&vfsp->vfs_sync_list)) - break; - - spin_lock(&vfsp->vfs_sync_lock); - /* - * We can get woken by laptop mode, to do a sync - - * that's the (only!) case where the list would be - * empty with time remaining. - */ - if (!timeleft || list_empty(&vfsp->vfs_sync_list)) { - if (!timeleft) - timeleft = xfs_syncd_centisecs * - msecs_to_jiffies(10); - INIT_LIST_HEAD(&vfsp->vfs_sync_work.w_list); - list_add_tail(&vfsp->vfs_sync_work.w_list, - &vfsp->vfs_sync_list); - } - list_for_each_entry_safe(work, n, &vfsp->vfs_sync_list, w_list) - list_move(&work->w_list, &tmp); - spin_unlock(&vfsp->vfs_sync_lock); - - list_for_each_entry_safe(work, n, &tmp, w_list) { - (*work->w_syncer)(vfsp, work->w_data); - list_del(&work->w_list); - if (work == &vfsp->vfs_sync_work) - continue; - kmem_free(work, sizeof(struct bhv_vfs_sync_work)); - } - } - - return 0; -} - -STATIC int -xfs_fs_start_syncd( - bhv_vfs_t *vfsp) -{ - vfsp->vfs_sync_work.w_syncer = vfs_sync_worker; - vfsp->vfs_sync_work.w_vfs = vfsp; - vfsp->vfs_sync_task = kthread_run(xfssyncd, vfsp, "xfssyncd"); - if (IS_ERR(vfsp->vfs_sync_task)) - return -PTR_ERR(vfsp->vfs_sync_task); - return 0; -} - -STATIC void -xfs_fs_stop_syncd( - bhv_vfs_t *vfsp) -{ - kthread_stop(vfsp->vfs_sync_task); -} - -STATIC void -xfs_fs_put_super( - struct super_block *sb) -{ - bhv_vfs_t *vfsp = vfs_from_sb(sb); - int error; - - xfs_fs_stop_syncd(vfsp); - bhv_vfs_sync(vfsp, SYNC_ATTR | SYNC_DELWRI, NULL); - error = bhv_vfs_unmount(vfsp, 0, NULL); - if (error) { - printk("XFS: unmount got error=%d\n", error); - printk("%s: vfs=0x%p left dangling!\n", __FUNCTION__, vfsp); - } else { - vfs_deallocate(vfsp); - } -} - -STATIC void -xfs_fs_write_super( - struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - bhv_vfs_sync(vfs_from_sb(sb), SYNC_FSDATA, NULL); - sb->s_dirt = 0; -} - -STATIC int -xfs_fs_sync_super( - struct super_block *sb, - int wait) -{ - bhv_vfs_t *vfsp = vfs_from_sb(sb); - int error; - int flags; - - if (unlikely(sb->s_frozen == SB_FREEZE_WRITE)) - flags = SYNC_QUIESCE; - else - flags = SYNC_FSDATA | (wait ? SYNC_WAIT : 0); - - error = bhv_vfs_sync(vfsp, flags, NULL); - sb->s_dirt = 0; - - if (unlikely(laptop_mode)) { - int prev_sync_seq = vfsp->vfs_sync_seq; - - /* - * The disk must be active because we're syncing. - * We schedule xfssyncd now (now that the disk is - * active) instead of later (when it might not be). - */ - wake_up_process(vfsp->vfs_sync_task); - /* - * We have to wait for the sync iteration to complete. - * If we don't, the disk activity caused by the sync - * will come after the sync is completed, and that - * triggers another sync from laptop mode. - */ - wait_event(vfsp->vfs_wait_single_sync_task, - vfsp->vfs_sync_seq != prev_sync_seq); - } - - return -error; -} - -STATIC int -xfs_fs_statfs( - struct dentry *dentry, - struct kstatfs *statp) -{ - return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, - vn_from_inode(dentry->d_inode)); -} - -STATIC int -xfs_fs_remount( - struct super_block *sb, - int *flags, - char *options) -{ - bhv_vfs_t *vfsp = vfs_from_sb(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb, 0); - int error; - - error = bhv_vfs_parseargs(vfsp, options, args, 1); - if (!error) - error = bhv_vfs_mntupdate(vfsp, flags, args); - kmem_free(args, sizeof(*args)); - return -error; -} - -STATIC void -xfs_fs_lockfs( - struct super_block *sb) -{ - bhv_vfs_freeze(vfs_from_sb(sb)); -} - -STATIC int -xfs_fs_show_options( - struct seq_file *m, - struct vfsmount *mnt) -{ - return -bhv_vfs_showargs(vfs_from_sb(mnt->mnt_sb), m); -} - -STATIC int -xfs_fs_quotasync( - struct super_block *sb, - int type) -{ - return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XQUOTASYNC, 0, NULL); -} - -STATIC int -xfs_fs_getxstate( - struct super_block *sb, - struct fs_quota_stat *fqs) -{ - return -bhv_vfs_quotactl(vfs_from_sb(sb), Q_XGETQSTAT, 0, (caddr_t)fqs); -} - -STATIC int -xfs_fs_setxstate( - struct super_block *sb, - unsigned int flags, - int op) -{ - return -bhv_vfs_quotactl(vfs_from_sb(sb), op, 0, (caddr_t)&flags); -} - -STATIC int -xfs_fs_getxquota( - struct super_block *sb, - int type, - qid_t id, - struct fs_disk_quota *fdq) -{ - return -bhv_vfs_quotactl(vfs_from_sb(sb), - (type == USRQUOTA) ? Q_XGETQUOTA : - ((type == GRPQUOTA) ? Q_XGETGQUOTA : - Q_XGETPQUOTA), id, (caddr_t)fdq); -} - -STATIC int -xfs_fs_setxquota( - struct super_block *sb, - int type, - qid_t id, - struct fs_disk_quota *fdq) -{ - return -bhv_vfs_quotactl(vfs_from_sb(sb), - (type == USRQUOTA) ? Q_XSETQLIM : - ((type == GRPQUOTA) ? Q_XSETGQLIM : - Q_XSETPQLIM), id, (caddr_t)fdq); -} - -STATIC int -xfs_fs_fill_super( - struct super_block *sb, - void *data, - int silent) -{ - struct bhv_vnode *rootvp; - struct bhv_vfs *vfsp = vfs_allocate(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb, silent); - struct kstatfs statvfs; - int error; - - bhv_insert_all_vfsops(vfsp); - - error = bhv_vfs_parseargs(vfsp, (char *)data, args, 0); - if (error) { - bhv_remove_all_vfsops(vfsp, 1); - goto fail_vfsop; - } - - sb_min_blocksize(sb, BBSIZE); - sb->s_export_op = &xfs_export_operations; - sb->s_qcop = &xfs_quotactl_operations; - sb->s_op = &xfs_super_operations; - - error = bhv_vfs_mount(vfsp, args, NULL); - if (error) { - bhv_remove_all_vfsops(vfsp, 1); - goto fail_vfsop; - } - - error = bhv_vfs_statvfs(vfsp, &statvfs, NULL); - if (error) - goto fail_unmount; - - sb->s_dirt = 1; - sb->s_magic = statvfs.f_type; - sb->s_blocksize = statvfs.f_bsize; - sb->s_blocksize_bits = ffs(statvfs.f_bsize) - 1; - sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); - sb->s_time_gran = 1; - set_posix_acl_flag(sb); - - error = bhv_vfs_root(vfsp, &rootvp); - if (error) - goto fail_unmount; - - sb->s_root = d_alloc_root(vn_to_inode(rootvp)); - if (!sb->s_root) { - error = ENOMEM; - goto fail_vnrele; - } - if (is_bad_inode(sb->s_root->d_inode)) { - error = EINVAL; - goto fail_vnrele; - } - if ((error = xfs_fs_start_syncd(vfsp))) - goto fail_vnrele; - vn_trace_exit(rootvp, __FUNCTION__, (inst_t *)__return_address); - - kmem_free(args, sizeof(*args)); - return 0; - -fail_vnrele: - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } else { - VN_RELE(rootvp); - } - -fail_unmount: - bhv_vfs_unmount(vfsp, 0, NULL); - -fail_vfsop: - vfs_deallocate(vfsp); - kmem_free(args, sizeof(*args)); - return -error; -} - -STATIC int -xfs_fs_get_sb( - struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data, - struct vfsmount *mnt) -{ - return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, - mnt); -} - -STATIC struct super_operations xfs_super_operations = { - .alloc_inode = xfs_fs_alloc_inode, - .destroy_inode = xfs_fs_destroy_inode, - .write_inode = xfs_fs_write_inode, - .clear_inode = xfs_fs_clear_inode, - .put_super = xfs_fs_put_super, - .write_super = xfs_fs_write_super, - .sync_fs = xfs_fs_sync_super, - .write_super_lockfs = xfs_fs_lockfs, - .statfs = xfs_fs_statfs, - .remount_fs = xfs_fs_remount, - .show_options = xfs_fs_show_options, -}; - -STATIC struct quotactl_ops xfs_quotactl_operations = { - .quota_sync = xfs_fs_quotasync, - .get_xstate = xfs_fs_getxstate, - .set_xstate = xfs_fs_setxstate, - .get_xquota = xfs_fs_getxquota, - .set_xquota = xfs_fs_setxquota, -}; - -STATIC struct file_system_type xfs_fs_type = { - .owner = THIS_MODULE, - .name = "xfs", - .get_sb = xfs_fs_get_sb, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; - - -STATIC int __init -init_xfs_fs( void ) -{ - int error; - struct sysinfo si; - static char message[] __initdata = KERN_INFO \ - XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n"; - - printk(message); - - si_meminfo(&si); - xfs_physmem = si.totalram; - - ktrace_init(64); - - error = xfs_init_zones(); - if (error < 0) - goto undo_zones; - - error = xfs_buf_init(); - if (error < 0) - goto undo_buffers; - - vn_init(); - xfs_init(); - uuid_init(); - vfs_initquota(); - - error = register_filesystem(&xfs_fs_type); - if (error) - goto undo_register; - return 0; - -undo_register: - xfs_buf_terminate(); - -undo_buffers: - xfs_destroy_zones(); - -undo_zones: - return error; -} - -STATIC void __exit -exit_xfs_fs( void ) -{ - vfs_exitquota(); - unregister_filesystem(&xfs_fs_type); - xfs_cleanup(); - xfs_buf_terminate(); - xfs_destroy_zones(); - ktrace_uninit(); -} - -module_init(init_xfs_fs); -module_exit(exit_xfs_fs); - -MODULE_AUTHOR("Silicon Graphics, Inc."); -MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); -MODULE_LICENSE("GPL"); diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c deleted file mode 100644 index af246532fbf..00000000000 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2001-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include <linux/sysctl.h> -#include <linux/proc_fs.h> - -static struct ctl_table_header *xfs_table_header; - -#ifdef CONFIG_PROC_FS -STATIC int -xfs_stats_clear_proc_handler( - ctl_table *ctl, - int write, - struct file *filp, - void __user *buffer, - size_t *lenp, - loff_t *ppos) -{ - int c, ret, *valp = ctl->data; - __uint32_t vn_active; - - ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); - - if (!ret && write && *valp) { - printk("XFS Clearing xfsstats\n"); - for_each_possible_cpu(c) { - preempt_disable(); - /* save vn_active, it's a universal truth! */ - vn_active = per_cpu(xfsstats, c).vn_active; - memset(&per_cpu(xfsstats, c), 0, - sizeof(struct xfsstats)); - per_cpu(xfsstats, c).vn_active = vn_active; - preempt_enable(); - } - xfs_stats_clear = 0; - } - - return ret; -} -#endif /* CONFIG_PROC_FS */ - -STATIC ctl_table xfs_table[] = { - {XFS_RESTRICT_CHOWN, "restrict_chown", &xfs_params.restrict_chown.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.restrict_chown.min, &xfs_params.restrict_chown.max}, - - {XFS_SGID_INHERIT, "irix_sgid_inherit", &xfs_params.sgid_inherit.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.sgid_inherit.min, &xfs_params.sgid_inherit.max}, - - {XFS_SYMLINK_MODE, "irix_symlink_mode", &xfs_params.symlink_mode.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.symlink_mode.min, &xfs_params.symlink_mode.max}, - - {XFS_PANIC_MASK, "panic_mask", &xfs_params.panic_mask.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.panic_mask.min, &xfs_params.panic_mask.max}, - - {XFS_ERRLEVEL, "error_level", &xfs_params.error_level.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.error_level.min, &xfs_params.error_level.max}, - - {XFS_SYNCD_TIMER, "xfssyncd_centisecs", &xfs_params.syncd_timer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.syncd_timer.min, &xfs_params.syncd_timer.max}, - - {XFS_INHERIT_SYNC, "inherit_sync", &xfs_params.inherit_sync.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_sync.min, &xfs_params.inherit_sync.max}, - - {XFS_INHERIT_NODUMP, "inherit_nodump", &xfs_params.inherit_nodump.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nodump.min, &xfs_params.inherit_nodump.max}, - - {XFS_INHERIT_NOATIME, "inherit_noatime", &xfs_params.inherit_noatim.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_noatim.min, &xfs_params.inherit_noatim.max}, - - {XFS_BUF_TIMER, "xfsbufd_centisecs", &xfs_params.xfs_buf_timer.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.xfs_buf_timer.min, &xfs_params.xfs_buf_timer.max}, - - {XFS_BUF_AGE, "age_buffer_centisecs", &xfs_params.xfs_buf_age.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.xfs_buf_age.min, &xfs_params.xfs_buf_age.max}, - - {XFS_INHERIT_NOSYM, "inherit_nosymlinks", &xfs_params.inherit_nosym.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nosym.min, &xfs_params.inherit_nosym.max}, - - {XFS_ROTORSTEP, "rotorstep", &xfs_params.rotorstep.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.rotorstep.min, &xfs_params.rotorstep.max}, - - {XFS_INHERIT_NODFRG, "inherit_nodefrag", &xfs_params.inherit_nodfrg.val, - sizeof(int), 0644, NULL, &proc_dointvec_minmax, - &sysctl_intvec, NULL, - &xfs_params.inherit_nodfrg.min, &xfs_params.inherit_nodfrg.max}, - - /* please keep this the last entry */ -#ifdef CONFIG_PROC_FS - {XFS_STATS_CLEAR, "stats_clear", &xfs_params.stats_clear.val, - sizeof(int), 0644, NULL, &xfs_stats_clear_proc_handler, - &sysctl_intvec, NULL, - &xfs_params.stats_clear.min, &xfs_params.stats_clear.max}, -#endif /* CONFIG_PROC_FS */ - - {0} -}; - -STATIC ctl_table xfs_dir_table[] = { - {FS_XFS, "xfs", NULL, 0, 0555, xfs_table}, - {0} -}; - -STATIC ctl_table xfs_root_table[] = { - {CTL_FS, "fs", NULL, 0, 0555, xfs_dir_table}, - {0} -}; - -void -xfs_sysctl_register(void) -{ - xfs_table_header = register_sysctl_table(xfs_root_table, 1); -} - -void -xfs_sysctl_unregister(void) -{ - if (xfs_table_header) - unregister_sysctl_table(xfs_table_header); -} diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c deleted file mode 100644 index 6145e8bd0be..00000000000 --- a/fs/xfs/linux-2.6/xfs_vfs.c +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_inum.h" -#include "xfs_log.h" -#include "xfs_clnt.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_imap.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_quota.h" - -int -vfs_mount( - struct bhv_desc *bdp, - struct xfs_mount_args *args, - struct cred *cr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_mount) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_mount)(next, args, cr)); -} - -int -vfs_parseargs( - struct bhv_desc *bdp, - char *s, - struct xfs_mount_args *args, - int f) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_parseargs) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_parseargs)(next, s, args, f)); -} - -int -vfs_showargs( - struct bhv_desc *bdp, - struct seq_file *m) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_showargs) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_showargs)(next, m)); -} - -int -vfs_unmount( - struct bhv_desc *bdp, - int fl, - struct cred *cr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_unmount) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_unmount)(next, fl, cr)); -} - -int -vfs_mntupdate( - struct bhv_desc *bdp, - int *fl, - struct xfs_mount_args *args) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_mntupdate) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_mntupdate)(next, fl, args)); -} - -int -vfs_root( - struct bhv_desc *bdp, - struct bhv_vnode **vpp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_root) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_root)(next, vpp)); -} - -int -vfs_statvfs( - struct bhv_desc *bdp, - bhv_statvfs_t *statp, - struct bhv_vnode *vp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_statvfs) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_statvfs)(next, statp, vp)); -} - -int -vfs_sync( - struct bhv_desc *bdp, - int fl, - struct cred *cr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_sync) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_sync)(next, fl, cr)); -} - -int -vfs_vget( - struct bhv_desc *bdp, - struct bhv_vnode **vpp, - struct fid *fidp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_vget) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_vget)(next, vpp, fidp)); -} - -int -vfs_dmapiops( - struct bhv_desc *bdp, - caddr_t addr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_dmapiops) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_dmapiops)(next, addr)); -} - -int -vfs_quotactl( - struct bhv_desc *bdp, - int cmd, - int id, - caddr_t addr) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_quotactl) - next = BHV_NEXT(next); - return ((*bhvtovfsops(next)->vfs_quotactl)(next, cmd, id, addr)); -} - -void -vfs_init_vnode( - struct bhv_desc *bdp, - struct bhv_vnode *vp, - struct bhv_desc *bp, - int unlock) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_init_vnode) - next = BHV_NEXT(next); - ((*bhvtovfsops(next)->vfs_init_vnode)(next, vp, bp, unlock)); -} - -void -vfs_force_shutdown( - struct bhv_desc *bdp, - int fl, - char *file, - int line) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_force_shutdown) - next = BHV_NEXT(next); - ((*bhvtovfsops(next)->vfs_force_shutdown)(next, fl, file, line)); -} - -void -vfs_freeze( - struct bhv_desc *bdp) -{ - struct bhv_desc *next = bdp; - - ASSERT(next); - while (! (bhvtovfsops(next))->vfs_freeze) - next = BHV_NEXT(next); - ((*bhvtovfsops(next)->vfs_freeze)(next)); -} - -bhv_vfs_t * -vfs_allocate( - struct super_block *sb) -{ - struct bhv_vfs *vfsp; - - vfsp = kmem_zalloc(sizeof(bhv_vfs_t), KM_SLEEP); - bhv_head_init(VFS_BHVHEAD(vfsp), "vfs"); - INIT_LIST_HEAD(&vfsp->vfs_sync_list); - spin_lock_init(&vfsp->vfs_sync_lock); - init_waitqueue_head(&vfsp->vfs_wait_single_sync_task); - - vfsp->vfs_super = sb; - sb->s_fs_info = vfsp; - - if (sb->s_flags & MS_RDONLY) - vfsp->vfs_flag |= VFS_RDONLY; - - return vfsp; -} - -bhv_vfs_t * -vfs_from_sb( - struct super_block *sb) -{ - return (bhv_vfs_t *)sb->s_fs_info; -} - -void -vfs_deallocate( - struct bhv_vfs *vfsp) -{ - bhv_head_destroy(VFS_BHVHEAD(vfsp)); - kmem_free(vfsp, sizeof(bhv_vfs_t)); -} - -void -vfs_insertops( - struct bhv_vfs *vfsp, - struct bhv_module_vfsops *vfsops) -{ - struct bhv_desc *bdp; - - bdp = kmem_alloc(sizeof(struct bhv_desc), KM_SLEEP); - bhv_desc_init(bdp, NULL, vfsp, vfsops); - bhv_insert(&vfsp->vfs_bh, bdp); -} - -void -vfs_insertbhv( - struct bhv_vfs *vfsp, - struct bhv_desc *bdp, - struct bhv_vfsops *vfsops, - void *mount) -{ - bhv_desc_init(bdp, mount, vfsp, vfsops); - bhv_insert_initial(&vfsp->vfs_bh, bdp); -} - -void -bhv_remove_vfsops( - struct bhv_vfs *vfsp, - int pos) -{ - struct bhv_desc *bhv; - - bhv = bhv_lookup_range(&vfsp->vfs_bh, pos, pos); - if (!bhv) - return; - bhv_remove(&vfsp->vfs_bh, bhv); - kmem_free(bhv, sizeof(*bhv)); -} - -void -bhv_remove_all_vfsops( - struct bhv_vfs *vfsp, - int freebase) -{ - struct xfs_mount *mp; - - bhv_remove_vfsops(vfsp, VFS_POSITION_QM); - bhv_remove_vfsops(vfsp, VFS_POSITION_DM); - if (!freebase) - return; - mp = XFS_VFSTOM(vfsp); - VFS_REMOVEBHV(vfsp, &mp->m_bhv); - xfs_mount_free(mp, 0); -} - -void -bhv_insert_all_vfsops( - struct bhv_vfs *vfsp) -{ - struct xfs_mount *mp; - - mp = xfs_mount_init(); - vfs_insertbhv(vfsp, &mp->m_bhv, &xfs_vfsops, mp); - vfs_insertdmapi(vfsp); - vfs_insertquota(vfsp); -} diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h deleted file mode 100644 index 91fc2c4b335..00000000000 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VFS_H__ -#define __XFS_VFS_H__ - -#include <linux/vfs.h> -#include "xfs_fs.h" - -struct bhv_vfs; -struct bhv_vnode; - -struct fid; -struct cred; -struct seq_file; -struct super_block; -struct xfs_mount_args; - -typedef struct kstatfs bhv_statvfs_t; - -typedef struct bhv_vfs_sync_work { - struct list_head w_list; - struct bhv_vfs *w_vfs; - void *w_data; /* syncer routine argument */ - void (*w_syncer)(struct bhv_vfs *, void *); -} bhv_vfs_sync_work_t; - -typedef struct bhv_vfs { - u_int vfs_flag; /* flags */ - xfs_fsid_t vfs_fsid; /* file system ID */ - xfs_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ - bhv_head_t vfs_bh; /* head of vfs behavior chain */ - struct super_block *vfs_super; /* generic superblock pointer */ - struct task_struct *vfs_sync_task; /* generalised sync thread */ - bhv_vfs_sync_work_t vfs_sync_work; /* work item for VFS_SYNC */ - struct list_head vfs_sync_list; /* sync thread work item list */ - spinlock_t vfs_sync_lock; /* work item list lock */ - int vfs_sync_seq; /* sync thread generation no. */ - wait_queue_head_t vfs_wait_single_sync_task; -} bhv_vfs_t; - -#define bhvtovfs(bdp) ( (struct bhv_vfs *)BHV_VOBJ(bdp) ) -#define bhvtovfsops(bdp) ( (struct bhv_vfsops *)BHV_OPS(bdp) ) -#define VFS_BHVHEAD(vfs) ( &(vfs)->vfs_bh ) -#define VFS_REMOVEBHV(vfs, bdp) ( bhv_remove(VFS_BHVHEAD(vfs), bdp) ) - -#define VFS_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ -#define VFS_POSITION_TOP BHV_POSITION_TOP /* chain top */ -#define VFS_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ - -typedef enum { - VFS_BHV_UNKNOWN, /* not specified */ - VFS_BHV_XFS, /* xfs */ - VFS_BHV_DM, /* data migration */ - VFS_BHV_QM, /* quota manager */ - VFS_BHV_IO, /* IO path */ - VFS_BHV_END /* housekeeping end-of-range */ -} bhv_vfs_type_t; - -#define VFS_POSITION_XFS (BHV_POSITION_BASE) -#define VFS_POSITION_DM (VFS_POSITION_BASE+10) -#define VFS_POSITION_QM (VFS_POSITION_BASE+20) -#define VFS_POSITION_IO (VFS_POSITION_BASE+30) - -#define VFS_RDONLY 0x0001 /* read-only vfs */ -#define VFS_GRPID 0x0002 /* group-ID assigned from directory */ -#define VFS_DMI 0x0004 /* filesystem has the DMI enabled */ -#define VFS_UMOUNT 0x0008 /* unmount in progress */ -#define VFS_32BITINODES 0x0010 /* do not use inums above 32 bits */ -#define VFS_END 0x0010 /* max flag */ - -#define SYNC_ATTR 0x0001 /* sync attributes */ -#define SYNC_CLOSE 0x0002 /* close file system down */ -#define SYNC_DELWRI 0x0004 /* look at delayed writes */ -#define SYNC_WAIT 0x0008 /* wait for i/o to complete */ -#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */ -#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ -#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ -#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ -#define SYNC_QUIESCE 0x0100 /* quiesce fileystem for a snapshot */ - -#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ -#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ -#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ -#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ -#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ -#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ - -typedef int (*vfs_mount_t)(bhv_desc_t *, - struct xfs_mount_args *, struct cred *); -typedef int (*vfs_parseargs_t)(bhv_desc_t *, char *, - struct xfs_mount_args *, int); -typedef int (*vfs_showargs_t)(bhv_desc_t *, struct seq_file *); -typedef int (*vfs_unmount_t)(bhv_desc_t *, int, struct cred *); -typedef int (*vfs_mntupdate_t)(bhv_desc_t *, int *, - struct xfs_mount_args *); -typedef int (*vfs_root_t)(bhv_desc_t *, struct bhv_vnode **); -typedef int (*vfs_statvfs_t)(bhv_desc_t *, bhv_statvfs_t *, - struct bhv_vnode *); -typedef int (*vfs_sync_t)(bhv_desc_t *, int, struct cred *); -typedef int (*vfs_vget_t)(bhv_desc_t *, struct bhv_vnode **, struct fid *); -typedef int (*vfs_dmapiops_t)(bhv_desc_t *, caddr_t); -typedef int (*vfs_quotactl_t)(bhv_desc_t *, int, int, caddr_t); -typedef void (*vfs_init_vnode_t)(bhv_desc_t *, - struct bhv_vnode *, bhv_desc_t *, int); -typedef void (*vfs_force_shutdown_t)(bhv_desc_t *, int, char *, int); -typedef void (*vfs_freeze_t)(bhv_desc_t *); - -typedef struct bhv_vfsops { - bhv_position_t vf_position; /* behavior chain position */ - vfs_mount_t vfs_mount; /* mount file system */ - vfs_parseargs_t vfs_parseargs; /* parse mount options */ - vfs_showargs_t vfs_showargs; /* unparse mount options */ - vfs_unmount_t vfs_unmount; /* unmount file system */ - vfs_mntupdate_t vfs_mntupdate; /* update file system options */ - vfs_root_t vfs_root; /* get root vnode */ - vfs_statvfs_t vfs_statvfs; /* file system statistics */ - vfs_sync_t vfs_sync; /* flush files */ - vfs_vget_t vfs_vget; /* get vnode from fid */ - vfs_dmapiops_t vfs_dmapiops; /* data migration */ - vfs_quotactl_t vfs_quotactl; /* disk quota */ - vfs_init_vnode_t vfs_init_vnode; /* initialize a new vnode */ - vfs_force_shutdown_t vfs_force_shutdown; /* crash and burn */ - vfs_freeze_t vfs_freeze; /* freeze fs for snapshot */ -} bhv_vfsops_t; - -/* - * Virtual filesystem operations, operating from head bhv. - */ -#define VFSHEAD(v) ((v)->vfs_bh.bh_first) -#define bhv_vfs_mount(v, ma,cr) vfs_mount(VFSHEAD(v), ma,cr) -#define bhv_vfs_parseargs(v, o,ma,f) vfs_parseargs(VFSHEAD(v), o,ma,f) -#define bhv_vfs_showargs(v, m) vfs_showargs(VFSHEAD(v), m) -#define bhv_vfs_unmount(v, f,cr) vfs_unmount(VFSHEAD(v), f,cr) -#define bhv_vfs_mntupdate(v, fl,args) vfs_mntupdate(VFSHEAD(v), fl,args) -#define bhv_vfs_root(v, vpp) vfs_root(VFSHEAD(v), vpp) -#define bhv_vfs_statvfs(v, sp,vp) vfs_statvfs(VFSHEAD(v), sp,vp) -#define bhv_vfs_sync(v, flag,cr) vfs_sync(VFSHEAD(v), flag,cr) -#define bhv_vfs_vget(v, vpp,fidp) vfs_vget(VFSHEAD(v), vpp,fidp) -#define bhv_vfs_dmapiops(v, p) vfs_dmapiops(VFSHEAD(v), p) -#define bhv_vfs_quotactl(v, c,id,p) vfs_quotactl(VFSHEAD(v), c,id,p) -#define bhv_vfs_init_vnode(v, vp,b,ul) vfs_init_vnode(VFSHEAD(v), vp,b,ul) -#define bhv_vfs_force_shutdown(v,u,f,l) vfs_force_shutdown(VFSHEAD(v), u,f,l) -#define bhv_vfs_freeze(v) vfs_freeze(VFSHEAD(v)) - -/* - * Virtual filesystem operations, operating from next bhv. - */ -#define bhv_next_vfs_mount(b, ma,cr) vfs_mount(b, ma,cr) -#define bhv_next_vfs_parseargs(b, o,ma,f) vfs_parseargs(b, o,ma,f) -#define bhv_next_vfs_showargs(b, m) vfs_showargs(b, m) -#define bhv_next_vfs_unmount(b, f,cr) vfs_unmount(b, f,cr) -#define bhv_next_vfs_mntupdate(b, fl,args) vfs_mntupdate(b, fl, args) -#define bhv_next_vfs_root(b, vpp) vfs_root(b, vpp) -#define bhv_next_vfs_statvfs(b, sp,vp) vfs_statvfs(b, sp,vp) -#define bhv_next_vfs_sync(b, flag,cr) vfs_sync(b, flag,cr) -#define bhv_next_vfs_vget(b, vpp,fidp) vfs_vget(b, vpp,fidp) -#define bhv_next_vfs_dmapiops(b, p) vfs_dmapiops(b, p) -#define bhv_next_vfs_quotactl(b, c,id,p) vfs_quotactl(b, c,id,p) -#define bhv_next_vfs_init_vnode(b, vp,b2,ul) vfs_init_vnode(b, vp,b2,ul) -#define bhv_next_force_shutdown(b, fl,f,l) vfs_force_shutdown(b, fl,f,l) -#define bhv_next_vfs_freeze(b) vfs_freeze(b) - -extern int vfs_mount(bhv_desc_t *, struct xfs_mount_args *, struct cred *); -extern int vfs_parseargs(bhv_desc_t *, char *, struct xfs_mount_args *, int); -extern int vfs_showargs(bhv_desc_t *, struct seq_file *); -extern int vfs_unmount(bhv_desc_t *, int, struct cred *); -extern int vfs_mntupdate(bhv_desc_t *, int *, struct xfs_mount_args *); -extern int vfs_root(bhv_desc_t *, struct bhv_vnode **); -extern int vfs_statvfs(bhv_desc_t *, bhv_statvfs_t *, struct bhv_vnode *); -extern int vfs_sync(bhv_desc_t *, int, struct cred *); -extern int vfs_vget(bhv_desc_t *, struct bhv_vnode **, struct fid *); -extern int vfs_dmapiops(bhv_desc_t *, caddr_t); -extern int vfs_quotactl(bhv_desc_t *, int, int, caddr_t); -extern void vfs_init_vnode(bhv_desc_t *, struct bhv_vnode *, bhv_desc_t *, int); -extern void vfs_force_shutdown(bhv_desc_t *, int, char *, int); -extern void vfs_freeze(bhv_desc_t *); - -#define vfs_test_for_freeze(vfs) ((vfs)->vfs_super->s_frozen) -#define vfs_wait_for_freeze(vfs,l) vfs_check_frozen((vfs)->vfs_super, (l)) - -typedef struct bhv_module_vfsops { - struct bhv_vfsops bhv_common; - void * bhv_custom; -} bhv_module_vfsops_t; - -#define vfs_bhv_lookup(v, id) (bhv_lookup_range(&(v)->vfs_bh, (id), (id))) -#define vfs_bhv_custom(b) (((bhv_module_vfsops_t*)BHV_OPS(b))->bhv_custom) -#define vfs_bhv_set_custom(b,o) ((b)->bhv_custom = (void *)(o)) -#define vfs_bhv_clr_custom(b) ((b)->bhv_custom = NULL) - -extern bhv_vfs_t *vfs_allocate(struct super_block *); -extern bhv_vfs_t *vfs_from_sb(struct super_block *); -extern void vfs_deallocate(bhv_vfs_t *); -extern void vfs_insertbhv(bhv_vfs_t *, bhv_desc_t *, bhv_vfsops_t *, void *); - -extern void vfs_insertops(bhv_vfs_t *, bhv_module_vfsops_t *); - -extern void bhv_insert_all_vfsops(struct bhv_vfs *); -extern void bhv_remove_all_vfsops(struct bhv_vfs *, int); -extern void bhv_remove_vfsops(struct bhv_vfs *, int); - -#endif /* __XFS_VFS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c deleted file mode 100644 index 553fa731ade..00000000000 --- a/fs/xfs/linux-2.6/xfs_vnode.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" - -uint64_t vn_generation; /* vnode generation number */ -DEFINE_SPINLOCK(vnumber_lock); - -/* - * Dedicated vnode inactive/reclaim sync semaphores. - * Prime number of hash buckets since address is used as the key. - */ -#define NVSYNC 37 -#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC]) -STATIC wait_queue_head_t vsync[NVSYNC]; - -void -vn_init(void) -{ - int i; - - for (i = 0; i < NVSYNC; i++) - init_waitqueue_head(&vsync[i]); -} - -void -vn_iowait( - bhv_vnode_t *vp) -{ - wait_queue_head_t *wq = vptosync(vp); - - wait_event(*wq, (atomic_read(&vp->v_iocount) == 0)); -} - -void -vn_iowake( - bhv_vnode_t *vp) -{ - if (atomic_dec_and_test(&vp->v_iocount)) - wake_up(vptosync(vp)); -} - -/* - * Volume managers supporting multiple paths can send back ENODEV when the - * final path disappears. In this case continuing to fill the page cache - * with dirty data which cannot be written out is evil, so prevent that. - */ -void -vn_ioerror( - bhv_vnode_t *vp, - int error, - char *f, - int l) -{ - if (unlikely(error == -ENODEV)) - bhv_vfs_force_shutdown(vp->v_vfsp, SHUTDOWN_DEVICE_REQ, f, l); -} - -bhv_vnode_t * -vn_initialize( - struct inode *inode) -{ - bhv_vnode_t *vp = vn_from_inode(inode); - - XFS_STATS_INC(vn_active); - XFS_STATS_INC(vn_alloc); - - vp->v_flag = VMODIFIED; - spinlock_init(&vp->v_lock, "v_lock"); - - spin_lock(&vnumber_lock); - if (!++vn_generation) /* v_number shouldn't be zero */ - vn_generation++; - vp->v_number = vn_generation; - spin_unlock(&vnumber_lock); - - ASSERT(VN_CACHED(vp) == 0); - - /* Initialize the first behavior and the behavior chain head. */ - vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode"); - - atomic_set(&vp->v_iocount, 0); - -#ifdef XFS_VNODE_TRACE - vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP); -#endif /* XFS_VNODE_TRACE */ - - vn_trace_exit(vp, __FUNCTION__, (inst_t *)__return_address); - return vp; -} - -/* - * Revalidate the Linux inode from the vattr. - * Note: i_size _not_ updated; we must hold the inode - * semaphore when doing that - callers responsibility. - */ -void -vn_revalidate_core( - bhv_vnode_t *vp, - bhv_vattr_t *vap) -{ - struct inode *inode = vn_to_inode(vp); - - inode->i_mode = vap->va_mode; - inode->i_nlink = vap->va_nlink; - inode->i_uid = vap->va_uid; - inode->i_gid = vap->va_gid; - inode->i_blocks = vap->va_nblocks; - inode->i_mtime = vap->va_mtime; - inode->i_ctime = vap->va_ctime; - if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (vap->va_xflags & XFS_XFLAG_APPEND) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (vap->va_xflags & XFS_XFLAG_SYNC) - inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (vap->va_xflags & XFS_XFLAG_NOATIME) - inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; -} - -/* - * Revalidate the Linux inode from the vnode. - */ -int -__vn_revalidate( - bhv_vnode_t *vp, - bhv_vattr_t *vattr) -{ - int error; - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - vattr->va_mask = XFS_AT_STAT | XFS_AT_XFLAGS; - error = bhv_vop_getattr(vp, vattr, 0, NULL); - if (likely(!error)) { - vn_revalidate_core(vp, vattr); - VUNMODIFY(vp); - } - return -error; -} - -int -vn_revalidate( - bhv_vnode_t *vp) -{ - bhv_vattr_t vattr; - - return __vn_revalidate(vp, &vattr); -} - -/* - * Add a reference to a referenced vnode. - */ -bhv_vnode_t * -vn_hold( - bhv_vnode_t *vp) -{ - struct inode *inode; - - XFS_STATS_INC(vn_hold); - - VN_LOCK(vp); - inode = igrab(vn_to_inode(vp)); - ASSERT(inode); - VN_UNLOCK(vp, 0); - - return vp; -} - -#ifdef XFS_VNODE_TRACE - -#define KTRACE_ENTER(vp, vk, s, line, ra) \ - ktrace_enter( (vp)->v_trace, \ -/* 0 */ (void *)(__psint_t)(vk), \ -/* 1 */ (void *)(s), \ -/* 2 */ (void *)(__psint_t) line, \ -/* 3 */ (void *)(__psint_t)(vn_count(vp)), \ -/* 4 */ (void *)(ra), \ -/* 5 */ (void *)(__psunsigned_t)(vp)->v_flag, \ -/* 6 */ (void *)(__psint_t)current_cpu(), \ -/* 7 */ (void *)(__psint_t)current_pid(), \ -/* 8 */ (void *)__return_address, \ -/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL) - -/* - * Vnode tracing code. - */ -void -vn_trace_entry(bhv_vnode_t *vp, const char *func, inst_t *ra) -{ - KTRACE_ENTER(vp, VNODE_KTRACE_ENTRY, func, 0, ra); -} - -void -vn_trace_exit(bhv_vnode_t *vp, const char *func, inst_t *ra) -{ - KTRACE_ENTER(vp, VNODE_KTRACE_EXIT, func, 0, ra); -} - -void -vn_trace_hold(bhv_vnode_t *vp, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(vp, VNODE_KTRACE_HOLD, file, line, ra); -} - -void -vn_trace_ref(bhv_vnode_t *vp, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(vp, VNODE_KTRACE_REF, file, line, ra); -} - -void -vn_trace_rele(bhv_vnode_t *vp, char *file, int line, inst_t *ra) -{ - KTRACE_ENTER(vp, VNODE_KTRACE_RELE, file, line, ra); -} -#endif /* XFS_VNODE_TRACE */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h deleted file mode 100644 index c42b3221b20..00000000000 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ /dev/null @@ -1,605 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VNODE_H__ -#define __XFS_VNODE_H__ - -struct uio; -struct file; -struct bhv_vfs; -struct bhv_vattr; -struct xfs_iomap; -struct attrlist_cursor_kern; - -typedef struct dentry bhv_vname_t; -typedef __u64 bhv_vnumber_t; - -typedef enum bhv_vflags { - VMODIFIED = 0x08, /* XFS inode state possibly differs */ - /* to the Linux inode state. */ - VTRUNCATED = 0x40, /* truncated down so flush-on-close */ -} bhv_vflags_t; - -/* - * MP locking protocols: - * v_flag, v_vfsp VN_LOCK/VN_UNLOCK - */ -typedef struct bhv_vnode { - bhv_vflags_t v_flag; /* vnode flags (see above) */ - bhv_vfs_t *v_vfsp; /* ptr to containing VFS */ - bhv_vnumber_t v_number; /* in-core vnode number */ - bhv_head_t v_bh; /* behavior head */ - spinlock_t v_lock; /* VN_LOCK/VN_UNLOCK */ - atomic_t v_iocount; /* outstanding I/O count */ -#ifdef XFS_VNODE_TRACE - struct ktrace *v_trace; /* trace header structure */ -#endif - struct inode v_inode; /* Linux inode */ - /* inode MUST be last */ -} bhv_vnode_t; - -#define VN_ISLNK(vp) S_ISLNK((vp)->v_inode.i_mode) -#define VN_ISREG(vp) S_ISREG((vp)->v_inode.i_mode) -#define VN_ISDIR(vp) S_ISDIR((vp)->v_inode.i_mode) -#define VN_ISCHR(vp) S_ISCHR((vp)->v_inode.i_mode) -#define VN_ISBLK(vp) S_ISBLK((vp)->v_inode.i_mode) - -#define VNODE_POSITION_BASE BHV_POSITION_BASE /* chain bottom */ -#define VNODE_POSITION_TOP BHV_POSITION_TOP /* chain top */ -#define VNODE_POSITION_INVALID BHV_POSITION_INVALID /* invalid pos. num */ - -typedef enum { - VN_BHV_UNKNOWN, /* not specified */ - VN_BHV_XFS, /* xfs */ - VN_BHV_DM, /* data migration */ - VN_BHV_QM, /* quota manager */ - VN_BHV_IO, /* IO path */ - VN_BHV_END /* housekeeping end-of-range */ -} vn_bhv_t; - -#define VNODE_POSITION_XFS (VNODE_POSITION_BASE) -#define VNODE_POSITION_DM (VNODE_POSITION_BASE+10) -#define VNODE_POSITION_QM (VNODE_POSITION_BASE+20) -#define VNODE_POSITION_IO (VNODE_POSITION_BASE+30) - -/* - * Macros for dealing with the behavior descriptor inside of the vnode. - */ -#define BHV_TO_VNODE(bdp) ((bhv_vnode_t *)BHV_VOBJ(bdp)) -#define BHV_TO_VNODE_NULL(bdp) ((bhv_vnode_t *)BHV_VOBJNULL(bdp)) - -#define VN_BHV_HEAD(vp) ((bhv_head_t *)(&((vp)->v_bh))) -#define vn_bhv_head_init(bhp,name) bhv_head_init(bhp,name) -#define vn_bhv_remove(bhp,bdp) bhv_remove(bhp,bdp) -#define vn_bhv_lookup(bhp,ops) bhv_lookup(bhp,ops) -#define vn_bhv_lookup_unlocked(bhp,ops) bhv_lookup_unlocked(bhp,ops) - -/* - * Vnode to Linux inode mapping. - */ -static inline struct bhv_vnode *vn_from_inode(struct inode *inode) -{ - return container_of(inode, bhv_vnode_t, v_inode); -} -static inline struct inode *vn_to_inode(struct bhv_vnode *vnode) -{ - return &vnode->v_inode; -} - -/* - * Values for the vop_rwlock/rwunlock flags parameter. - */ -typedef enum bhv_vrwlock { - VRWLOCK_NONE, - VRWLOCK_READ, - VRWLOCK_WRITE, - VRWLOCK_WRITE_DIRECT, - VRWLOCK_TRY_READ, - VRWLOCK_TRY_WRITE -} bhv_vrwlock_t; - -/* - * Return values for bhv_vop_inactive. A return value of - * VN_INACTIVE_NOCACHE implies that the file system behavior - * has disassociated its state and bhv_desc_t from the vnode. - */ -#define VN_INACTIVE_CACHE 0 -#define VN_INACTIVE_NOCACHE 1 - -/* - * Values for the cmd code given to vop_vnode_change. - */ -typedef enum bhv_vchange { - VCHANGE_FLAGS_FRLOCKS = 0, - VCHANGE_FLAGS_ENF_LOCKING = 1, - VCHANGE_FLAGS_TRUNCATED = 2, - VCHANGE_FLAGS_PAGE_DIRTY = 3, - VCHANGE_FLAGS_IOEXCL_COUNT = 4 -} bhv_vchange_t; - -typedef enum { L_FALSE, L_TRUE } lastclose_t; - -typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); -typedef int (*vop_close_t)(bhv_desc_t *, int, lastclose_t, struct cred *); -typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, - const struct iovec *, unsigned int, - loff_t *, int, struct cred *); -typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, - loff_t *, int, size_t, read_actor_t, - void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, - struct pipe_inode_info *, size_t, int, int, - struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, - struct file *, loff_t *, size_t, int, int, - struct cred *); -typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, - int, unsigned int, void __user *); -typedef int (*vop_getattr_t)(bhv_desc_t *, struct bhv_vattr *, int, - struct cred *); -typedef int (*vop_setattr_t)(bhv_desc_t *, struct bhv_vattr *, int, - struct cred *); -typedef int (*vop_access_t)(bhv_desc_t *, int, struct cred *); -typedef int (*vop_lookup_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t **, - int, bhv_vnode_t *, struct cred *); -typedef int (*vop_create_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *, - bhv_vnode_t **, struct cred *); -typedef int (*vop_remove_t)(bhv_desc_t *, bhv_vname_t *, struct cred *); -typedef int (*vop_link_t)(bhv_desc_t *, bhv_vnode_t *, bhv_vname_t *, - struct cred *); -typedef int (*vop_rename_t)(bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *, - bhv_vname_t *, struct cred *); -typedef int (*vop_mkdir_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr *, - bhv_vnode_t **, struct cred *); -typedef int (*vop_rmdir_t)(bhv_desc_t *, bhv_vname_t *, struct cred *); -typedef int (*vop_readdir_t)(bhv_desc_t *, struct uio *, struct cred *, - int *); -typedef int (*vop_symlink_t)(bhv_desc_t *, bhv_vname_t *, struct bhv_vattr*, - char *, bhv_vnode_t **, struct cred *); -typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int, - struct cred *); -typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *, - xfs_off_t, xfs_off_t); -typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *); -typedef int (*vop_fid2_t)(bhv_desc_t *, struct fid *); -typedef int (*vop_release_t)(bhv_desc_t *); -typedef int (*vop_rwlock_t)(bhv_desc_t *, bhv_vrwlock_t); -typedef void (*vop_rwunlock_t)(bhv_desc_t *, bhv_vrwlock_t); -typedef int (*vop_bmap_t)(bhv_desc_t *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -typedef int (*vop_reclaim_t)(bhv_desc_t *); -typedef int (*vop_attr_get_t)(bhv_desc_t *, const char *, char *, int *, - int, struct cred *); -typedef int (*vop_attr_set_t)(bhv_desc_t *, const char *, char *, int, - int, struct cred *); -typedef int (*vop_attr_remove_t)(bhv_desc_t *, const char *, - int, struct cred *); -typedef int (*vop_attr_list_t)(bhv_desc_t *, char *, int, int, - struct attrlist_cursor_kern *, struct cred *); -typedef void (*vop_link_removed_t)(bhv_desc_t *, bhv_vnode_t *, int); -typedef void (*vop_vnode_change_t)(bhv_desc_t *, bhv_vchange_t, __psint_t); -typedef void (*vop_ptossvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -typedef void (*vop_pflushinvalvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, int); -typedef int (*vop_pflushvp_t)(bhv_desc_t *, xfs_off_t, xfs_off_t, - uint64_t, int); -typedef int (*vop_iflush_t)(bhv_desc_t *, int); - - -typedef struct bhv_vnodeops { - bhv_position_t vn_position; /* position within behavior chain */ - vop_open_t vop_open; - vop_close_t vop_close; - vop_read_t vop_read; - vop_write_t vop_write; - vop_sendfile_t vop_sendfile; - vop_splice_read_t vop_splice_read; - vop_splice_write_t vop_splice_write; - vop_ioctl_t vop_ioctl; - vop_getattr_t vop_getattr; - vop_setattr_t vop_setattr; - vop_access_t vop_access; - vop_lookup_t vop_lookup; - vop_create_t vop_create; - vop_remove_t vop_remove; - vop_link_t vop_link; - vop_rename_t vop_rename; - vop_mkdir_t vop_mkdir; - vop_rmdir_t vop_rmdir; - vop_readdir_t vop_readdir; - vop_symlink_t vop_symlink; - vop_readlink_t vop_readlink; - vop_fsync_t vop_fsync; - vop_inactive_t vop_inactive; - vop_fid2_t vop_fid2; - vop_rwlock_t vop_rwlock; - vop_rwunlock_t vop_rwunlock; - vop_bmap_t vop_bmap; - vop_reclaim_t vop_reclaim; - vop_attr_get_t vop_attr_get; - vop_attr_set_t vop_attr_set; - vop_attr_remove_t vop_attr_remove; - vop_attr_list_t vop_attr_list; - vop_link_removed_t vop_link_removed; - vop_vnode_change_t vop_vnode_change; - vop_ptossvp_t vop_tosspages; - vop_pflushinvalvp_t vop_flushinval_pages; - vop_pflushvp_t vop_flush_pages; - vop_release_t vop_release; - vop_iflush_t vop_iflush; -} bhv_vnodeops_t; - -/* - * Virtual node operations, operating from head bhv. - */ -#define VNHEAD(vp) ((vp)->v_bh.bh_first) -#define VOP(op, vp) (*((bhv_vnodeops_t *)VNHEAD(vp)->bd_ops)->op) -#define bhv_vop_open(vp, cr) VOP(vop_open, vp)(VNHEAD(vp),cr) -#define bhv_vop_close(vp, f,last,cr) VOP(vop_close, vp)(VNHEAD(vp),f,last,cr) -#define bhv_vop_read(vp,file,iov,segs,offset,ioflags,cr) \ - VOP(vop_read, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) -#define bhv_vop_write(vp,file,iov,segs,offset,ioflags,cr) \ - VOP(vop_write, vp)(VNHEAD(vp),file,iov,segs,offset,ioflags,cr) -#define bhv_vop_sendfile(vp,f,off,ioflags,cnt,act,targ,cr) \ - VOP(vop_sendfile, vp)(VNHEAD(vp),f,off,ioflags,cnt,act,targ,cr) -#define bhv_vop_splice_read(vp,f,o,pipe,cnt,fl,iofl,cr) \ - VOP(vop_splice_read, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr) -#define bhv_vop_splice_write(vp,f,o,pipe,cnt,fl,iofl,cr) \ - VOP(vop_splice_write, vp)(VNHEAD(vp),f,o,pipe,cnt,fl,iofl,cr) -#define bhv_vop_bmap(vp,of,sz,rw,b,n) \ - VOP(vop_bmap, vp)(VNHEAD(vp),of,sz,rw,b,n) -#define bhv_vop_getattr(vp, vap,f,cr) \ - VOP(vop_getattr, vp)(VNHEAD(vp), vap,f,cr) -#define bhv_vop_setattr(vp, vap,f,cr) \ - VOP(vop_setattr, vp)(VNHEAD(vp), vap,f,cr) -#define bhv_vop_access(vp, mode,cr) VOP(vop_access, vp)(VNHEAD(vp), mode,cr) -#define bhv_vop_lookup(vp,d,vpp,f,rdir,cr) \ - VOP(vop_lookup, vp)(VNHEAD(vp),d,vpp,f,rdir,cr) -#define bhv_vop_create(dvp,d,vap,vpp,cr) \ - VOP(vop_create, dvp)(VNHEAD(dvp),d,vap,vpp,cr) -#define bhv_vop_remove(dvp,d,cr) VOP(vop_remove, dvp)(VNHEAD(dvp),d,cr) -#define bhv_vop_link(dvp,fvp,d,cr) VOP(vop_link, dvp)(VNHEAD(dvp),fvp,d,cr) -#define bhv_vop_rename(fvp,fnm,tdvp,tnm,cr) \ - VOP(vop_rename, fvp)(VNHEAD(fvp),fnm,tdvp,tnm,cr) -#define bhv_vop_mkdir(dp,d,vap,vpp,cr) \ - VOP(vop_mkdir, dp)(VNHEAD(dp),d,vap,vpp,cr) -#define bhv_vop_rmdir(dp,d,cr) VOP(vop_rmdir, dp)(VNHEAD(dp),d,cr) -#define bhv_vop_readdir(vp,uiop,cr,eofp) \ - VOP(vop_readdir, vp)(VNHEAD(vp),uiop,cr,eofp) -#define bhv_vop_symlink(dvp,d,vap,tnm,vpp,cr) \ - VOP(vop_symlink, dvp)(VNHEAD(dvp),d,vap,tnm,vpp,cr) -#define bhv_vop_readlink(vp,uiop,fl,cr) \ - VOP(vop_readlink, vp)(VNHEAD(vp),uiop,fl,cr) -#define bhv_vop_fsync(vp,f,cr,b,e) VOP(vop_fsync, vp)(VNHEAD(vp),f,cr,b,e) -#define bhv_vop_inactive(vp,cr) VOP(vop_inactive, vp)(VNHEAD(vp),cr) -#define bhv_vop_release(vp) VOP(vop_release, vp)(VNHEAD(vp)) -#define bhv_vop_fid2(vp,fidp) VOP(vop_fid2, vp)(VNHEAD(vp),fidp) -#define bhv_vop_rwlock(vp,i) VOP(vop_rwlock, vp)(VNHEAD(vp),i) -#define bhv_vop_rwlock_try(vp,i) VOP(vop_rwlock, vp)(VNHEAD(vp),i) -#define bhv_vop_rwunlock(vp,i) VOP(vop_rwunlock, vp)(VNHEAD(vp),i) -#define bhv_vop_frlock(vp,c,fl,flags,offset,fr) \ - VOP(vop_frlock, vp)(VNHEAD(vp),c,fl,flags,offset,fr) -#define bhv_vop_reclaim(vp) VOP(vop_reclaim, vp)(VNHEAD(vp)) -#define bhv_vop_attr_get(vp, name, val, vallenp, fl, cred) \ - VOP(vop_attr_get, vp)(VNHEAD(vp),name,val,vallenp,fl,cred) -#define bhv_vop_attr_set(vp, name, val, vallen, fl, cred) \ - VOP(vop_attr_set, vp)(VNHEAD(vp),name,val,vallen,fl,cred) -#define bhv_vop_attr_remove(vp, name, flags, cred) \ - VOP(vop_attr_remove, vp)(VNHEAD(vp),name,flags,cred) -#define bhv_vop_attr_list(vp, buf, buflen, fl, cursor, cred) \ - VOP(vop_attr_list, vp)(VNHEAD(vp),buf,buflen,fl,cursor,cred) -#define bhv_vop_link_removed(vp, dvp, linkzero) \ - VOP(vop_link_removed, vp)(VNHEAD(vp), dvp, linkzero) -#define bhv_vop_vnode_change(vp, cmd, val) \ - VOP(vop_vnode_change, vp)(VNHEAD(vp), cmd, val) -#define bhv_vop_toss_pages(vp, first, last, fiopt) \ - VOP(vop_tosspages, vp)(VNHEAD(vp), first, last, fiopt) -#define bhv_vop_flushinval_pages(vp, first, last, fiopt) \ - VOP(vop_flushinval_pages, vp)(VNHEAD(vp),first,last,fiopt) -#define bhv_vop_flush_pages(vp, first, last, flags, fiopt) \ - VOP(vop_flush_pages, vp)(VNHEAD(vp),first,last,flags,fiopt) -#define bhv_vop_ioctl(vp, inode, filp, fl, cmd, arg) \ - VOP(vop_ioctl, vp)(VNHEAD(vp),inode,filp,fl,cmd,arg) -#define bhv_vop_iflush(vp, flags) VOP(vop_iflush, vp)(VNHEAD(vp), flags) - -/* - * Flags for read/write calls - same values as IRIX - */ -#define IO_ISAIO 0x00001 /* don't wait for completion */ -#define IO_ISDIRECT 0x00004 /* bypass page cache */ -#define IO_INVIS 0x00020 /* don't update inode timestamps */ - -/* - * Flags for vop_iflush call - */ -#define FLUSH_SYNC 1 /* wait for flush to complete */ -#define FLUSH_INODE 2 /* flush the inode itself */ -#define FLUSH_LOG 4 /* force the last log entry for - * this inode out to disk */ - -/* - * Flush/Invalidate options for vop_toss/flush/flushinval_pages. - */ -#define FI_NONE 0 /* none */ -#define FI_REMAPF 1 /* Do a remapf prior to the operation */ -#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation. - Prevent VM access to the pages until - the operation completes. */ - -/* - * Vnode attributes. va_mask indicates those attributes the caller - * wants to set or extract. - */ -typedef struct bhv_vattr { - int va_mask; /* bit-mask of attributes present */ - mode_t va_mode; /* file access mode and type */ - xfs_nlink_t va_nlink; /* number of references to file */ - uid_t va_uid; /* owner user id */ - gid_t va_gid; /* owner group id */ - xfs_ino_t va_nodeid; /* file id */ - xfs_off_t va_size; /* file size in bytes */ - u_long va_blocksize; /* blocksize preferred for i/o */ - struct timespec va_atime; /* time of last access */ - struct timespec va_mtime; /* time of last modification */ - struct timespec va_ctime; /* time file changed */ - u_int va_gen; /* generation number of file */ - xfs_dev_t va_rdev; /* device the special file represents */ - __int64_t va_nblocks; /* number of blocks allocated */ - u_long va_xflags; /* random extended file flags */ - u_long va_extsize; /* file extent size */ - u_long va_nextents; /* number of extents in file */ - u_long va_anextents; /* number of attr extents in file */ - prid_t va_projid; /* project id */ -} bhv_vattr_t; - -/* - * setattr or getattr attributes - */ -#define XFS_AT_TYPE 0x00000001 -#define XFS_AT_MODE 0x00000002 -#define XFS_AT_UID 0x00000004 -#define XFS_AT_GID 0x00000008 -#define XFS_AT_FSID 0x00000010 -#define XFS_AT_NODEID 0x00000020 -#define XFS_AT_NLINK 0x00000040 -#define XFS_AT_SIZE 0x00000080 -#define XFS_AT_ATIME 0x00000100 -#define XFS_AT_MTIME 0x00000200 -#define XFS_AT_CTIME 0x00000400 -#define XFS_AT_RDEV 0x00000800 -#define XFS_AT_BLKSIZE 0x00001000 -#define XFS_AT_NBLOCKS 0x00002000 -#define XFS_AT_VCODE 0x00004000 -#define XFS_AT_MAC 0x00008000 -#define XFS_AT_UPDATIME 0x00010000 -#define XFS_AT_UPDMTIME 0x00020000 -#define XFS_AT_UPDCTIME 0x00040000 -#define XFS_AT_ACL 0x00080000 -#define XFS_AT_CAP 0x00100000 -#define XFS_AT_INF 0x00200000 -#define XFS_AT_XFLAGS 0x00400000 -#define XFS_AT_EXTSIZE 0x00800000 -#define XFS_AT_NEXTENTS 0x01000000 -#define XFS_AT_ANEXTENTS 0x02000000 -#define XFS_AT_PROJID 0x04000000 -#define XFS_AT_SIZE_NOPERM 0x08000000 -#define XFS_AT_GENCOUNT 0x10000000 - -#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ - XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ - XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ - XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\ - XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\ - XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT) - -#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\ - XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\ - XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\ - XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID) - -#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME) - -#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME) - -#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\ - XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ - XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) - -/* - * Modes. - */ -#define VSUID S_ISUID /* set user id on execution */ -#define VSGID S_ISGID /* set group id on execution */ -#define VSVTX S_ISVTX /* save swapped text even after use */ -#define VREAD S_IRUSR /* read, write, execute permissions */ -#define VWRITE S_IWUSR -#define VEXEC S_IXUSR - -#define MODEMASK S_IALLUGO /* mode bits plus permission bits */ - -/* - * Check whether mandatory file locking is enabled. - */ -#define MANDLOCK(vp, mode) \ - (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID) - -extern void vn_init(void); -extern bhv_vnode_t *vn_initialize(struct inode *); -extern int vn_revalidate(struct bhv_vnode *); -extern int __vn_revalidate(struct bhv_vnode *, bhv_vattr_t *); -extern void vn_revalidate_core(struct bhv_vnode *, bhv_vattr_t *); - -extern void vn_iowait(struct bhv_vnode *vp); -extern void vn_iowake(struct bhv_vnode *vp); - -extern void vn_ioerror(struct bhv_vnode *vp, int error, char *f, int l); - -static inline int vn_count(struct bhv_vnode *vp) -{ - return atomic_read(&vn_to_inode(vp)->i_count); -} - -/* - * Vnode reference counting functions (and macros for compatibility). - */ -extern bhv_vnode_t *vn_hold(struct bhv_vnode *); - -#if defined(XFS_VNODE_TRACE) -#define VN_HOLD(vp) \ - ((void)vn_hold(vp), \ - vn_trace_hold(vp, __FILE__, __LINE__, (inst_t *)__return_address)) -#define VN_RELE(vp) \ - (vn_trace_rele(vp, __FILE__, __LINE__, (inst_t *)__return_address), \ - iput(vn_to_inode(vp))) -#else -#define VN_HOLD(vp) ((void)vn_hold(vp)) -#define VN_RELE(vp) (iput(vn_to_inode(vp))) -#endif - -static inline struct bhv_vnode *vn_grab(struct bhv_vnode *vp) -{ - struct inode *inode = igrab(vn_to_inode(vp)); - return inode ? vn_from_inode(inode) : NULL; -} - -/* - * Vname handling macros. - */ -#define VNAME(dentry) ((char *) (dentry)->d_name.name) -#define VNAMELEN(dentry) ((dentry)->d_name.len) -#define VNAME_TO_VNODE(dentry) (vn_from_inode((dentry)->d_inode)) - -/* - * Vnode spinlock manipulation. - */ -#define VN_LOCK(vp) mutex_spinlock(&(vp)->v_lock) -#define VN_UNLOCK(vp, s) mutex_spinunlock(&(vp)->v_lock, s) - -static __inline__ void vn_flagset(struct bhv_vnode *vp, uint flag) -{ - spin_lock(&vp->v_lock); - vp->v_flag |= flag; - spin_unlock(&vp->v_lock); -} - -static __inline__ uint vn_flagclr(struct bhv_vnode *vp, uint flag) -{ - uint cleared; - - spin_lock(&vp->v_lock); - cleared = (vp->v_flag & flag); - vp->v_flag &= ~flag; - spin_unlock(&vp->v_lock); - return cleared; -} - -#define VMODIFY(vp) vn_flagset(vp, VMODIFIED) -#define VUNMODIFY(vp) vn_flagclr(vp, VMODIFIED) -#define VTRUNCATE(vp) vn_flagset(vp, VTRUNCATED) -#define VUNTRUNCATE(vp) vn_flagclr(vp, VTRUNCATED) - -/* - * Dealing with bad inodes - */ -static inline void vn_mark_bad(struct bhv_vnode *vp) -{ - make_bad_inode(vn_to_inode(vp)); -} - -static inline int VN_BAD(struct bhv_vnode *vp) -{ - return is_bad_inode(vn_to_inode(vp)); -} - -/* - * Extracting atime values in various formats - */ -static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime) -{ - bs_atime->tv_sec = vp->v_inode.i_atime.tv_sec; - bs_atime->tv_nsec = vp->v_inode.i_atime.tv_nsec; -} - -static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts) -{ - *ts = vp->v_inode.i_atime; -} - -static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt) -{ - *tt = vp->v_inode.i_atime.tv_sec; -} - -/* - * Some useful predicates. - */ -#define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping) -#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages) -#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \ - PAGECACHE_TAG_DIRTY) -#define VN_TRUNC(vp) ((vp)->v_flag & VTRUNCATED) - -/* - * Flags to vop_setattr/getattr. - */ -#define ATTR_UTIME 0x01 /* non-default utime(2) request */ -#define ATTR_DMI 0x08 /* invocation from a DMI function */ -#define ATTR_LAZY 0x80 /* set/get attributes lazily */ -#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */ -#define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */ -#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */ - -/* - * Flags to vop_fsync/reclaim. - */ -#define FSYNC_NOWAIT 0 /* asynchronous flush */ -#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */ -#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */ -#define FSYNC_DATA 0x4 /* synchronous fsync of data only */ - -/* - * Tracking vnode activity. - */ -#if defined(XFS_VNODE_TRACE) - -#define VNODE_TRACE_SIZE 16 /* number of trace entries */ -#define VNODE_KTRACE_ENTRY 1 -#define VNODE_KTRACE_EXIT 2 -#define VNODE_KTRACE_HOLD 3 -#define VNODE_KTRACE_REF 4 -#define VNODE_KTRACE_RELE 5 - -extern void vn_trace_entry(struct bhv_vnode *, const char *, inst_t *); -extern void vn_trace_exit(struct bhv_vnode *, const char *, inst_t *); -extern void vn_trace_hold(struct bhv_vnode *, char *, int, inst_t *); -extern void vn_trace_ref(struct bhv_vnode *, char *, int, inst_t *); -extern void vn_trace_rele(struct bhv_vnode *, char *, int, inst_t *); - -#define VN_TRACE(vp) \ - vn_trace_ref(vp, __FILE__, __LINE__, (inst_t *)__return_address) -#else -#define vn_trace_entry(a,b,c) -#define vn_trace_exit(a,b,c) -#define vn_trace_hold(a,b,c,d) -#define vn_trace_ref(a,b,c,d) -#define vn_trace_rele(a,b,c,d) -#define VN_TRACE(vp) -#endif - -#endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/mrlock.h index 32e1ce0f04c..e3c92d19e54 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/mrlock.h @@ -20,29 +20,35 @@ #include <linux/rwsem.h> -enum { MR_NONE, MR_ACCESS, MR_UPDATE }; - typedef struct { struct rw_semaphore mr_lock; +#if defined(DEBUG) || defined(XFS_WARN) int mr_writer; +#endif } mrlock_t; +#if defined(DEBUG) || defined(XFS_WARN) #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) -#define mraccess(mrp) mraccessf(mrp, 0) -#define mrupdate(mrp) mrupdatef(mrp, 0) -static inline void mraccessf(mrlock_t *mrp, int flags) +static inline void mraccess_nested(mrlock_t *mrp, int subclass) { - down_read(&mrp->mr_lock); + down_read_nested(&mrp->mr_lock, subclass); } -static inline void mrupdatef(mrlock_t *mrp, int flags) +static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { - down_write(&mrp->mr_lock); + down_write_nested(&mrp->mr_lock, subclass); +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; +#endif } static inline int mrtryaccess(mrlock_t *mrp) @@ -54,39 +60,31 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; +#if defined(DEBUG) || defined(XFS_WARN) mrp->mr_writer = 1; +#endif return 1; } -static inline void mrunlock(mrlock_t *mrp) +static inline void mrunlock_excl(mrlock_t *mrp) { - if (mrp->mr_writer) { - mrp->mr_writer = 0; - up_write(&mrp->mr_lock); - } else { - up_read(&mrp->mr_lock); - } +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); } -static inline void mrdemote(mrlock_t *mrp) +static inline void mrunlock_shared(mrlock_t *mrp) { - mrp->mr_writer = 0; - downgrade_write(&mrp->mr_lock); + up_read(&mrp->mr_lock); } -#ifdef DEBUG -/* - * Debug-only routine, without some platform-specific asm code, we can - * now only answer requests regarding whether we hold the lock for write - * (reader state is outside our visibility, we only track writer state). - * Note: means !ismrlocked would give false positives, so don't do that. - */ -static inline int ismrlocked(mrlock_t *mrp, int type) +static inline void mrdemote(mrlock_t *mrp) { - if (mrp && type == MR_UPDATE) - return mrp->mr_writer; - return 1; -} +#if defined(DEBUG) || defined(XFS_WARN) + mrp->mr_writer = 0; #endif + downgrade_write(&mrp->mr_lock); +} #endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c deleted file mode 100644 index 3aa77153185..00000000000 --- a/fs/xfs/quota/xfs_dquot.c +++ /dev/null @@ -1,1599 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_trans_space.h" -#include "xfs_trans_priv.h" -#include "xfs_qm.h" - - -/* - LOCK ORDER - - inode lock (ilock) - dquot hash-chain lock (hashlock) - xqm dquot freelist lock (freelistlock - mount's dquot list lock (mplistlock) - user dquot lock - lock ordering among dquots is based on the uid or gid - group dquot lock - similar to udquots. Between the two dquots, the udquot - has to be locked first. - pin lock - the dquot lock must be held to take this lock. - flush lock - ditto. -*/ - -STATIC void xfs_qm_dqflush_done(xfs_buf_t *, xfs_dq_logitem_t *); - -#ifdef DEBUG -xfs_buftarg_t *xfs_dqerror_target; -int xfs_do_dqerror; -int xfs_dqreq_num; -int xfs_dqerror_mod = 33; -#endif - -/* - * Allocate and initialize a dquot. We don't always allocate fresh memory; - * we try to reclaim a free dquot if the number of incore dquots are above - * a threshold. - * The only field inside the core that gets initialized at this point - * is the d_id field. The idea is to fill in the entire q_core - * when we read in the on disk dquot. - */ -STATIC xfs_dquot_t * -xfs_qm_dqinit( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type) -{ - xfs_dquot_t *dqp; - boolean_t brandnewdquot; - - brandnewdquot = xfs_qm_dqalloc_incore(&dqp); - dqp->dq_flags = type; - dqp->q_core.d_id = cpu_to_be32(id); - dqp->q_mount = mp; - - /* - * No need to re-initialize these if this is a reclaimed dquot. - */ - if (brandnewdquot) { - dqp->dq_flnext = dqp->dq_flprev = dqp; - mutex_init(&dqp->q_qlock); - initnsema(&dqp->q_flock, 1, "fdq"); - sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq"); - -#ifdef XFS_DQUOT_TRACE - dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP); - xfs_dqtrace_entry(dqp, "DQINIT"); -#endif - } else { - /* - * Only the q_core portion was zeroed in dqreclaim_one(). - * So, we need to reset others. - */ - dqp->q_nrefs = 0; - dqp->q_blkno = 0; - dqp->MPL_NEXT = dqp->HL_NEXT = NULL; - dqp->HL_PREVP = dqp->MPL_PREVP = NULL; - dqp->q_bufoffset = 0; - dqp->q_fileoffset = 0; - dqp->q_transp = NULL; - dqp->q_gdquot = NULL; - dqp->q_res_bcount = 0; - dqp->q_res_icount = 0; - dqp->q_res_rtbcount = 0; - dqp->q_pincount = 0; - dqp->q_hash = NULL; - ASSERT(dqp->dq_flnext == dqp->dq_flprev); - -#ifdef XFS_DQUOT_TRACE - ASSERT(dqp->q_trace); - xfs_dqtrace_entry(dqp, "DQRECLAIMED_INIT"); -#endif - } - - /* - * log item gets initialized later - */ - return (dqp); -} - -/* - * This is called to free all the memory associated with a dquot - */ -void -xfs_qm_dqdestroy( - xfs_dquot_t *dqp) -{ - ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp)); - - mutex_destroy(&dqp->q_qlock); - freesema(&dqp->q_flock); - sv_destroy(&dqp->q_pinwait); - -#ifdef XFS_DQUOT_TRACE - if (dqp->q_trace) - ktrace_free(dqp->q_trace); - dqp->q_trace = NULL; -#endif - kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); - atomic_dec(&xfs_Gqm->qm_totaldquots); -} - -/* - * This is what a 'fresh' dquot inside a dquot chunk looks like on disk. - */ -STATIC void -xfs_qm_dqinit_core( - xfs_dqid_t id, - uint type, - xfs_dqblk_t *d) -{ - /* - * Caller has zero'd the entire dquot 'chunk' already. - */ - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_id = cpu_to_be32(id); - d->dd_diskdq.d_flags = type; -} - - -#ifdef XFS_DQUOT_TRACE -/* - * Dquot tracing for debugging. - */ -/* ARGSUSED */ -void -__xfs_dqtrace_entry( - xfs_dquot_t *dqp, - char *func, - void *retaddr, - xfs_inode_t *ip) -{ - xfs_dquot_t *udqp = NULL; - xfs_ino_t ino = 0; - - ASSERT(dqp->q_trace); - if (ip) { - ino = ip->i_ino; - udqp = ip->i_udquot; - } - ktrace_enter(dqp->q_trace, - (void *)(__psint_t)DQUOT_KTRACE_ENTRY, - (void *)func, - (void *)(__psint_t)dqp->q_nrefs, - (void *)(__psint_t)dqp->dq_flags, - (void *)(__psint_t)dqp->q_res_bcount, - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_bcount), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_icount), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_hardlimit), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_blk_softlimit), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_hardlimit), - (void *)(__psint_t)be64_to_cpu(dqp->q_core.d_ino_softlimit), - (void *)(__psint_t)be32_to_cpu(dqp->q_core.d_id), - (void *)(__psint_t)current_pid(), - (void *)(__psint_t)ino, - (void *)(__psint_t)retaddr, - (void *)(__psint_t)udqp); - return; -} -#endif - - -/* - * If default limits are in force, push them into the dquot now. - * We overwrite the dquot limits only if they are zero and this - * is not the root dquot. - */ -void -xfs_qm_adjust_dqlimits( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) -{ - xfs_quotainfo_t *q = mp->m_quotainfo; - - ASSERT(d->d_id); - - if (q->qi_bsoftlimit && !d->d_blk_softlimit) - d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); - if (q->qi_bhardlimit && !d->d_blk_hardlimit) - d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); - if (q->qi_isoftlimit && !d->d_ino_softlimit) - d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); - if (q->qi_ihardlimit && !d->d_ino_hardlimit) - d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); - if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) - d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); - if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) - d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); -} - -/* - * Check the limits and timers of a dquot and start or reset timers - * if necessary. - * This gets called even when quota enforcement is OFF, which makes our - * life a little less complicated. (We just don't reject any quota - * reservations in that case, when enforcement is off). - * We also return 0 as the values of the timers in Q_GETQUOTA calls, when - * enforcement's off. - * In contrast, warnings are a little different in that they don't - * 'automatically' get started when limits get exceeded. They do - * get reset to zero, however, when we find the count to be under - * the soft limit (they are only ever set non-zero via userspace). - */ -void -xfs_qm_adjust_dqtimers( - xfs_mount_t *mp, - xfs_disk_dquot_t *d) -{ - ASSERT(d->d_id); - -#ifdef QUOTADEBUG - if (d->d_blk_hardlimit) - ASSERT(be64_to_cpu(d->d_blk_softlimit) <= - be64_to_cpu(d->d_blk_hardlimit)); - if (d->d_ino_hardlimit) - ASSERT(be64_to_cpu(d->d_ino_softlimit) <= - be64_to_cpu(d->d_ino_hardlimit)); - if (d->d_rtb_hardlimit) - ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= - be64_to_cpu(d->d_rtb_hardlimit)); -#endif - if (!d->d_btimer) { - if ((d->d_blk_softlimit && - (be64_to_cpu(d->d_bcount) >= - be64_to_cpu(d->d_blk_softlimit))) || - (d->d_blk_hardlimit && - (be64_to_cpu(d->d_bcount) >= - be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = cpu_to_be32(get_seconds() + - XFS_QI_BTIMELIMIT(mp)); - } else { - d->d_bwarns = 0; - } - } else { - if ((!d->d_blk_softlimit || - (be64_to_cpu(d->d_bcount) < - be64_to_cpu(d->d_blk_softlimit))) && - (!d->d_blk_hardlimit || - (be64_to_cpu(d->d_bcount) < - be64_to_cpu(d->d_blk_hardlimit)))) { - d->d_btimer = 0; - } - } - - if (!d->d_itimer) { - if ((d->d_ino_softlimit && - (be64_to_cpu(d->d_icount) >= - be64_to_cpu(d->d_ino_softlimit))) || - (d->d_ino_hardlimit && - (be64_to_cpu(d->d_icount) >= - be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = cpu_to_be32(get_seconds() + - XFS_QI_ITIMELIMIT(mp)); - } else { - d->d_iwarns = 0; - } - } else { - if ((!d->d_ino_softlimit || - (be64_to_cpu(d->d_icount) < - be64_to_cpu(d->d_ino_softlimit))) && - (!d->d_ino_hardlimit || - (be64_to_cpu(d->d_icount) < - be64_to_cpu(d->d_ino_hardlimit)))) { - d->d_itimer = 0; - } - } - - if (!d->d_rtbtimer) { - if ((d->d_rtb_softlimit && - (be64_to_cpu(d->d_rtbcount) >= - be64_to_cpu(d->d_rtb_softlimit))) || - (d->d_rtb_hardlimit && - (be64_to_cpu(d->d_rtbcount) >= - be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = cpu_to_be32(get_seconds() + - XFS_QI_RTBTIMELIMIT(mp)); - } else { - d->d_rtbwarns = 0; - } - } else { - if ((!d->d_rtb_softlimit || - (be64_to_cpu(d->d_rtbcount) < - be64_to_cpu(d->d_rtb_softlimit))) && - (!d->d_rtb_hardlimit || - (be64_to_cpu(d->d_rtbcount) < - be64_to_cpu(d->d_rtb_hardlimit)))) { - d->d_rtbtimer = 0; - } - } -} - -/* - * initialize a buffer full of dquots and log the whole thing - */ -STATIC void -xfs_qm_init_dquot_blk( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_buf_t *bp) -{ - xfs_dqblk_t *d; - int curid, i; - - ASSERT(tp); - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - - d = (xfs_dqblk_t *)XFS_BUF_PTR(bp); - - /* - * ID of the first dquot in the block - id's are zero based. - */ - curid = id - (id % XFS_QM_DQPERBLK(mp)); - ASSERT(curid >= 0); - memset(d, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp))); - for (i = 0; i < XFS_QM_DQPERBLK(mp); i++, d++, curid++) - xfs_qm_dqinit_core(curid, type, d); - xfs_trans_dquot_buf(tp, bp, - (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : - ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : - XFS_BLI_GDQUOT_BUF))); - xfs_trans_log_buf(tp, bp, 0, BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1); -} - - - -/* - * Allocate a block and fill it with dquots. - * This is called when the bmapi finds a hole. - */ -STATIC int -xfs_qm_dqalloc( - xfs_trans_t **tpp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - xfs_inode_t *quotip, - xfs_fileoff_t offset_fsb, - xfs_buf_t **O_bpp) -{ - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - xfs_bmbt_irec_t map; - int nmaps, error, committed; - xfs_buf_t *bp; - xfs_trans_t *tp = *tpp; - - ASSERT(tp != NULL); - xfs_dqtrace_entry(dqp, "DQALLOC"); - - /* - * Initialize the bmap freelist prior to calling bmapi code. - */ - XFS_BMAP_INIT(&flist, &firstblock); - xfs_ilock(quotip, XFS_ILOCK_EXCL); - /* - * Return if this type of quotas is turned off while we didn't - * have an inode lock - */ - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - return (ESRCH); - } - - /* - * xfs_trans_commit normally decrements the vnode ref count - * when it unlocks the inode. Since we want to keep the quota - * inode around, we bump the vnode ref count now. - */ - VN_HOLD(XFS_ITOV(quotip)); - - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); - nmaps = 1; - if ((error = xfs_bmapi(tp, quotip, - offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA | XFS_BMAPI_WRITE, - &firstblock, - XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &flist, NULL))) { - goto error0; - } - ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(nmaps == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - /* - * Keep track of the blkno to save a lookup later - */ - dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - - /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), - 0); - if (!bp || (error = XFS_BUF_GETERROR(bp))) - goto error1; - /* - * Make a chunk of dquots out of this buffer and log - * the entire thing. - */ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), - dqp->dq_flags & XFS_DQ_ALLTYPES, bp); - - /* - * xfs_bmap_finish() may commit the current transaction and - * start a second transaction if the freelist is not empty. - * - * Since we still want to modify this buffer, we need to - * ensure that the buffer is not released on commit of - * the first transaction and ensure the buffer is added to the - * second transaction. - * - * If there is only one transaction then don't stop the buffer - * from being released when it commits later on. - */ - - xfs_trans_bhold(tp, bp); - - if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) { - goto error1; - } - - if (committed) { - tp = *tpp; - xfs_trans_bjoin(tp, bp); - } else { - xfs_trans_bhold_release(tp, bp); - } - - *O_bpp = bp; - return 0; - - error1: - xfs_bmap_cancel(&flist); - error0: - xfs_iunlock(quotip, XFS_ILOCK_EXCL); - - return (error); -} - -/* - * Maps a dquot to the buffer containing its on-disk version. - * This returns a ptr to the buffer containing the on-disk dquot - * in the bpp param, and a ptr to the on-disk dquot within that buffer - */ -STATIC int -xfs_qm_dqtobp( - xfs_trans_t **tpp, - xfs_dquot_t *dqp, - xfs_disk_dquot_t **O_ddpp, - xfs_buf_t **O_bpp, - uint flags) -{ - xfs_bmbt_irec_t map; - int nmaps, error; - xfs_buf_t *bp; - xfs_inode_t *quotip; - xfs_mount_t *mp; - xfs_disk_dquot_t *ddq; - xfs_dqid_t id; - boolean_t newdquot; - xfs_trans_t *tp = (tpp ? *tpp : NULL); - - mp = dqp->q_mount; - id = be32_to_cpu(dqp->q_core.d_id); - nmaps = 1; - newdquot = B_FALSE; - - /* - * If we don't know where the dquot lives, find out. - */ - if (dqp->q_blkno == (xfs_daddr_t) 0) { - /* We use the id as an index */ - dqp->q_fileoffset = (xfs_fileoff_t)id / XFS_QM_DQPERBLK(mp); - nmaps = 1; - quotip = XFS_DQ_TO_QIP(dqp); - xfs_ilock(quotip, XFS_ILOCK_SHARED); - /* - * Return if this type of quotas is turned off while we didn't - * have an inode lock - */ - if (XFS_IS_THIS_QUOTA_OFF(dqp)) { - xfs_iunlock(quotip, XFS_ILOCK_SHARED); - return (ESRCH); - } - /* - * Find the block map; no allocations yet - */ - error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, - XFS_BMAPI_METADATA, - NULL, 0, &map, &nmaps, NULL, NULL); - - xfs_iunlock(quotip, XFS_ILOCK_SHARED); - if (error) - return (error); - ASSERT(nmaps == 1); - ASSERT(map.br_blockcount == 1); - - /* - * offset of dquot in the (fixed sized) dquot chunk. - */ - dqp->q_bufoffset = (id % XFS_QM_DQPERBLK(mp)) * - sizeof(xfs_dqblk_t); - if (map.br_startblock == HOLESTARTBLOCK) { - /* - * We don't allocate unless we're asked to - */ - if (!(flags & XFS_QMOPT_DQALLOC)) - return (ENOENT); - - ASSERT(tp); - if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, - dqp->q_fileoffset, &bp))) - return (error); - tp = *tpp; - newdquot = B_TRUE; - } else { - /* - * store the blkno etc so that we don't have to do the - * mapping all the time - */ - dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - } - } - ASSERT(dqp->q_blkno != DELAYSTARTBLOCK); - ASSERT(dqp->q_blkno != HOLESTARTBLOCK); - - /* - * Read in the buffer, unless we've just done the allocation - * (in which case we already have the buf). - */ - if (! newdquot) { - xfs_dqtrace_entry(dqp, "DQTOBP READBUF"); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - XFS_QI_DQCHUNKLEN(mp), - 0, &bp))) { - return (error); - } - if (error || !bp) - return XFS_ERROR(error); - } - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - - /* - * calculate the location of the dquot inside the buffer. - */ - ddq = (xfs_disk_dquot_t *)((char *)XFS_BUF_PTR(bp) + dqp->q_bufoffset); - - /* - * A simple sanity check in case we got a corrupted dquot... - */ - if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, - flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), - "dqtobp")) { - if (!(flags & XFS_QMOPT_DQREPAIR)) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EIO); - } - XFS_BUF_BUSY(bp); /* We dirtied this */ - } - - *O_bpp = bp; - *O_ddpp = ddq; - - return (0); -} - - -/* - * Read in the ondisk dquot using dqtobp() then copy it to an incore version, - * and release the buffer immediately. - * - */ -/* ARGSUSED */ -STATIC int -xfs_qm_dqread( - xfs_trans_t **tpp, - xfs_dqid_t id, - xfs_dquot_t *dqp, /* dquot to get filled in */ - uint flags) -{ - xfs_disk_dquot_t *ddqp; - xfs_buf_t *bp; - int error; - xfs_trans_t *tp; - - ASSERT(tpp); - - /* - * get a pointer to the on-disk dquot and the buffer containing it - * dqp already knows its own type (GROUP/USER). - */ - xfs_dqtrace_entry(dqp, "DQREAD"); - if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) { - return (error); - } - tp = *tpp; - - /* copy everything from disk dquot to the incore dquot */ - memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); - xfs_qm_dquot_logitem_init(dqp); - - /* - * Reservation counters are defined as reservation plus current usage - * to avoid having to add everytime. - */ - dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); - dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); - dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); - - /* Mark the buf so that this will stay incore a little longer */ - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF); - - /* - * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) - * So we need to release with xfs_trans_brelse(). - * The strategy here is identical to that of inodes; we lock - * the dquot in xfs_qm_dqget() before making it accessible to - * others. This is because dquots, like inodes, need a good level of - * concurrency, and we don't want to take locks on the entire buffers - * for dquot accesses. - * Note also that the dquot buffer may even be dirty at this point, if - * this particular dquot was repaired. We still aren't afraid to - * brelse it because we have the changes incore. - */ - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - xfs_trans_brelse(tp, bp); - - return (error); -} - - -/* - * allocate an incore dquot from the kernel heap, - * and fill its core with quota information kept on disk. - * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk - * if it wasn't already allocated. - */ -STATIC int -xfs_qm_idtodq( - xfs_mount_t *mp, - xfs_dqid_t id, /* gid or uid, depending on type */ - uint type, /* UDQUOT or GDQUOT */ - uint flags, /* DQALLOC, DQREPAIR */ - xfs_dquot_t **O_dqpp)/* OUT : incore dquot, not locked */ -{ - xfs_dquot_t *dqp; - int error; - xfs_trans_t *tp; - int cancelflags=0; - - dqp = xfs_qm_dqinit(mp, id, type); - tp = NULL; - if (flags & XFS_QMOPT_DQALLOC) { - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); - if ((error = xfs_trans_reserve(tp, - XFS_QM_DQALLOC_SPACE_RES(mp), - XFS_WRITE_LOG_RES(mp) + - BBTOB(XFS_QI_DQCHUNKLEN(mp)) - 1 + - 128, - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT))) { - cancelflags = 0; - goto error0; - } - cancelflags = XFS_TRANS_RELEASE_LOG_RES; - } - - /* - * Read it from disk; xfs_dqread() takes care of - * all the necessary initialization of dquot's fields (locks, etc) - */ - if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - xfs_dqtrace_entry(dqp, "DQREAD FAIL"); - cancelflags |= XFS_TRANS_ABORT; - goto error0; - } - if (tp) { - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, - NULL))) - goto error1; - } - - *O_dqpp = dqp; - return (0); - - error0: - ASSERT(error); - if (tp) - xfs_trans_cancel(tp, cancelflags); - error1: - xfs_qm_dqdestroy(dqp); - *O_dqpp = NULL; - return (error); -} - -/* - * Lookup a dquot in the incore dquot hashtable. We keep two separate - * hashtables for user and group dquots; and, these are global tables - * inside the XQM, not per-filesystem tables. - * The hash chain must be locked by caller, and it is left locked - * on return. Returning dquot is locked. - */ -STATIC int -xfs_qm_dqlookup( - xfs_mount_t *mp, - xfs_dqid_t id, - xfs_dqhash_t *qh, - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - uint flist_locked; - xfs_dquot_t *d; - - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); - - flist_locked = B_FALSE; - - /* - * Traverse the hashchain looking for a match - */ - for (dqp = qh->qh_next; dqp != NULL; dqp = dqp->HL_NEXT) { - /* - * We already have the hashlock. We don't need the - * dqlock to look at the id field of the dquot, since the - * id can't be modified without the hashlock anyway. - */ - if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) { - xfs_dqtrace_entry(dqp, "DQFOUND BY LOOKUP"); - /* - * All in core dquots must be on the dqlist of mp - */ - ASSERT(dqp->MPL_PREVP != NULL); - - xfs_dqlock(dqp); - if (dqp->q_nrefs == 0) { - ASSERT (XFS_DQ_IS_ON_FREELIST(dqp)); - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { - xfs_dqtrace_entry(dqp, "DQLOOKUP: WANT"); - - /* - * We may have raced with dqreclaim_one() - * (and lost). So, flag that we don't - * want the dquot to be reclaimed. - */ - dqp->dq_flags |= XFS_DQ_WANT; - xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); - xfs_dqlock(dqp); - dqp->dq_flags &= ~(XFS_DQ_WANT); - } - flist_locked = B_TRUE; - } - - /* - * id couldn't have changed; we had the hashlock all - * along - */ - ASSERT(be32_to_cpu(dqp->q_core.d_id) == id); - - if (flist_locked) { - if (dqp->q_nrefs != 0) { - xfs_qm_freelist_unlock(xfs_Gqm); - flist_locked = B_FALSE; - } else { - /* - * take it off the freelist - */ - xfs_dqtrace_entry(dqp, - "DQLOOKUP: TAKEOFF FL"); - XQM_FREELIST_REMOVE(dqp); - /* xfs_qm_freelist_print(&(xfs_Gqm-> - qm_dqfreelist), - "after removal"); */ - } - } - - /* - * grab a reference - */ - XFS_DQHOLD(dqp); - - if (flist_locked) - xfs_qm_freelist_unlock(xfs_Gqm); - /* - * move the dquot to the front of the hashchain - */ - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); - if (dqp->HL_PREVP != &qh->qh_next) { - xfs_dqtrace_entry(dqp, - "DQLOOKUP: HASH MOVETOFRONT"); - if ((d = dqp->HL_NEXT)) - d->HL_PREVP = dqp->HL_PREVP; - *(dqp->HL_PREVP) = d; - d = qh->qh_next; - d->HL_PREVP = &dqp->HL_NEXT; - dqp->HL_NEXT = d; - dqp->HL_PREVP = &qh->qh_next; - qh->qh_next = dqp; - } - xfs_dqtrace_entry(dqp, "LOOKUP END"); - *O_dqpp = dqp; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); - return (0); - } - } - - *O_dqpp = NULL; - ASSERT(XFS_DQ_IS_HASH_LOCKED(qh)); - return (1); -} - -/* - * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a - * a locked dquot, doing an allocation (if requested) as needed. - * When both an inode and an id are given, the inode's id takes precedence. - * That is, if the id changes while we don't hold the ilock inside this - * function, the new dquot is returned, not necessarily the one requested - * in the id argument. - */ -int -xfs_qm_dqget( - xfs_mount_t *mp, - xfs_inode_t *ip, /* locked inode (optional) */ - xfs_dqid_t id, /* uid/projid/gid depending on type */ - uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ - uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ - xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ -{ - xfs_dquot_t *dqp; - xfs_dqhash_t *h; - uint version; - int error; - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || - (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || - (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { - return (ESRCH); - } - h = XFS_DQ_HASH(mp, id, type); - -#ifdef DEBUG - if (xfs_do_dqerror) { - if ((xfs_dqerror_target == mp->m_ddev_targp) && - (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { - cmn_err(CE_DEBUG, "Returning error in dqget"); - return (EIO); - } - } -#endif - - again: - -#ifdef DEBUG - ASSERT(type == XFS_DQ_USER || - type == XFS_DQ_PROJ || - type == XFS_DQ_GROUP); - if (ip) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - if (type == XFS_DQ_USER) - ASSERT(ip->i_udquot == NULL); - else - ASSERT(ip->i_gdquot == NULL); - } -#endif - XFS_DQ_HASH_LOCK(h); - - /* - * Look in the cache (hashtable). - * The chain is kept locked during lookup. - */ - if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) { - XQM_STATS_INC(xqmstats.xs_qm_dqcachehits); - /* - * The dquot was found, moved to the front of the chain, - * taken off the freelist if it was on it, and locked - * at this point. Just unlock the hashchain and return. - */ - ASSERT(*O_dqpp); - ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp)); - XFS_DQ_HASH_UNLOCK(h); - xfs_dqtrace_entry(*O_dqpp, "DQGET DONE (FROM CACHE)"); - return (0); /* success */ - } - XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses); - - /* - * Dquot cache miss. We don't want to keep the inode lock across - * a (potential) disk read. Also we don't want to deal with the lock - * ordering between quotainode and this inode. OTOH, dropping the inode - * lock here means dealing with a chown that can happen before - * we re-acquire the lock. - */ - if (ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * Save the hashchain version stamp, and unlock the chain, so that - * we don't keep the lock across a disk read - */ - version = h->qh_version; - XFS_DQ_HASH_UNLOCK(h); - - /* - * Allocate the dquot on the kernel heap, and read the ondisk - * portion off the disk. Also, do all the necessary initialization - * This can return ENOENT if dquot didn't exist on disk and we didn't - * ask it to allocate; ESRCH if quotas got turned off suddenly. - */ - if ((error = xfs_qm_idtodq(mp, id, type, - flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR| - XFS_QMOPT_DOWARN), - &dqp))) { - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - return (error); - } - - /* - * See if this is mount code calling to look at the overall quota limits - * which are stored in the id == 0 user or group's dquot. - * Since we may not have done a quotacheck by this point, just return - * the dquot without attaching it to any hashtables, lists, etc, or even - * taking a reference. - * The caller must dqdestroy this once done. - */ - if (flags & XFS_QMOPT_DQSUSER) { - ASSERT(id == 0); - ASSERT(! ip); - goto dqret; - } - - /* - * Dquot lock comes after hashlock in the lock ordering - */ - if (ip) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (! XFS_IS_DQTYPE_ON(mp, type)) { - /* inode stays locked on return */ - xfs_qm_dqdestroy(dqp); - return XFS_ERROR(ESRCH); - } - /* - * A dquot could be attached to this inode by now, since - * we had dropped the ilock. - */ - if (type == XFS_DQ_USER) { - if (ip->i_udquot) { - xfs_qm_dqdestroy(dqp); - dqp = ip->i_udquot; - xfs_dqlock(dqp); - goto dqret; - } - } else { - if (ip->i_gdquot) { - xfs_qm_dqdestroy(dqp); - dqp = ip->i_gdquot; - xfs_dqlock(dqp); - goto dqret; - } - } - } - - /* - * Hashlock comes after ilock in lock order - */ - XFS_DQ_HASH_LOCK(h); - if (version != h->qh_version) { - xfs_dquot_t *tmpdqp; - /* - * Now, see if somebody else put the dquot in the - * hashtable before us. This can happen because we didn't - * keep the hashchain lock. We don't have to worry about - * lock order between the two dquots here since dqp isn't - * on any findable lists yet. - */ - if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) { - /* - * Duplicate found. Just throw away the new dquot - * and start over. - */ - xfs_qm_dqput(tmpdqp); - XFS_DQ_HASH_UNLOCK(h); - xfs_qm_dqdestroy(dqp); - XQM_STATS_INC(xqmstats.xs_qm_dquot_dups); - goto again; - } - } - - /* - * Put the dquot at the beginning of the hash-chain and mp's list - * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock .. - */ - ASSERT(XFS_DQ_IS_HASH_LOCKED(h)); - dqp->q_hash = h; - XQM_HASHLIST_INSERT(h, dqp); - - /* - * Attach this dquot to this filesystem's list of all dquots, - * kept inside the mount structure in m_quotainfo field - */ - xfs_qm_mplist_lock(mp); - - /* - * We return a locked dquot to the caller, with a reference taken - */ - xfs_dqlock(dqp); - dqp->q_nrefs = 1; - - XQM_MPLIST_INSERT(&(XFS_QI_MPL_LIST(mp)), dqp); - - xfs_qm_mplist_unlock(mp); - XFS_DQ_HASH_UNLOCK(h); - dqret: - ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip)); - xfs_dqtrace_entry(dqp, "DQGET DONE"); - *O_dqpp = dqp; - return (0); -} - - -/* - * Release a reference to the dquot (decrement ref-count) - * and unlock it. If there is a group quota attached to this - * dquot, carefully release that too without tripping over - * deadlocks'n'stuff. - */ -void -xfs_qm_dqput( - xfs_dquot_t *dqp) -{ - xfs_dquot_t *gdqp; - - ASSERT(dqp->q_nrefs > 0); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - xfs_dqtrace_entry(dqp, "DQPUT"); - - if (dqp->q_nrefs != 1) { - dqp->q_nrefs--; - xfs_dqunlock(dqp); - return; - } - - /* - * drop the dqlock and acquire the freelist and dqlock - * in the right order; but try to get it out-of-order first - */ - if (! xfs_qm_freelist_lock_nowait(xfs_Gqm)) { - xfs_dqtrace_entry(dqp, "DQPUT: FLLOCK-WAIT"); - xfs_dqunlock(dqp); - xfs_qm_freelist_lock(xfs_Gqm); - xfs_dqlock(dqp); - } - - while (1) { - gdqp = NULL; - - /* We can't depend on nrefs being == 1 here */ - if (--dqp->q_nrefs == 0) { - xfs_dqtrace_entry(dqp, "DQPUT: ON FREELIST"); - /* - * insert at end of the freelist. - */ - XQM_FREELIST_INSERT(&(xfs_Gqm->qm_dqfreelist), dqp); - - /* - * If we just added a udquot to the freelist, then - * we want to release the gdquot reference that - * it (probably) has. Otherwise it'll keep the - * gdquot from getting reclaimed. - */ - if ((gdqp = dqp->q_gdquot)) { - /* - * Avoid a recursive dqput call - */ - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - - /* xfs_qm_freelist_print(&(xfs_Gqm->qm_dqfreelist), - "@@@@@++ Free list (after append) @@@@@+"); - */ - } - xfs_dqunlock(dqp); - - /* - * If we had a group quota inside the user quota as a hint, - * release it now. - */ - if (! gdqp) - break; - dqp = gdqp; - } - xfs_qm_freelist_unlock(xfs_Gqm); -} - -/* - * Release a dquot. Flush it if dirty, then dqput() it. - * dquot must not be locked. - */ -void -xfs_qm_dqrele( - xfs_dquot_t *dqp) -{ - ASSERT(dqp); - xfs_dqtrace_entry(dqp, "DQRELE"); - - xfs_dqlock(dqp); - /* - * We don't care to flush it if the dquot is dirty here. - * That will create stutters that we want to avoid. - * Instead we do a delayed write when we try to reclaim - * a dirty dquot. Also xfs_sync will take part of the burden... - */ - xfs_qm_dqput(dqp); -} - - -/* - * Write a modified dquot to disk. - * The dquot must be locked and the flush lock too taken by caller. - * The flush lock will not be unlocked until the dquot reaches the disk, - * but the dquot is free to be unlocked and modified by the caller - * in the interim. Dquot is still locked on return. This behavior is - * identical to that of inodes. - */ -int -xfs_qm_dqflush( - xfs_dquot_t *dqp, - uint flags) -{ - xfs_mount_t *mp; - xfs_buf_t *bp; - xfs_disk_dquot_t *ddqp; - int error; - SPLDECL(s); - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); - xfs_dqtrace_entry(dqp, "DQFLUSH"); - - /* - * If not dirty, nada. - */ - if (!XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqfunlock(dqp); - return (0); - } - - /* - * Cant flush a pinned dquot. Wait for it. - */ - xfs_qm_dqunpin_wait(dqp); - - /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this dquot - * to disk, because the log record didn't make it to disk! - */ - if (XFS_FORCED_SHUTDOWN(dqp->q_mount)) { - dqp->dq_flags &= ~(XFS_DQ_DIRTY); - xfs_dqfunlock(dqp); - return XFS_ERROR(EIO); - } - - /* - * Get the buffer containing the on-disk dquot - * We don't need a transaction envelope because we know that the - * the ondisk-dquot has already been allocated for. - */ - if ((error = xfs_qm_dqtobp(NULL, dqp, &ddqp, &bp, XFS_QMOPT_DOWARN))) { - xfs_dqtrace_entry(dqp, "DQTOBP FAIL"); - ASSERT(error != ENOENT); - /* - * Quotas could have gotten turned off (ESRCH) - */ - xfs_dqfunlock(dqp); - return (error); - } - - if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), - 0, XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { - xfs_force_shutdown(dqp->q_mount, SHUTDOWN_CORRUPT_INCORE); - return XFS_ERROR(EIO); - } - - /* This is the only portion of data that needs to persist */ - memcpy(ddqp, &(dqp->q_core), sizeof(xfs_disk_dquot_t)); - - /* - * Clear the dirty field and remember the flush lsn for later use. - */ - dqp->dq_flags &= ~(XFS_DQ_DIRTY); - mp = dqp->q_mount; - - /* lsn is 64 bits */ - AIL_LOCK(mp, s); - dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn; - AIL_UNLOCK(mp, s); - - /* - * Attach an iodone routine so that we can remove this dquot from the - * AIL and release the flush lock once the dquot is synced to disk. - */ - xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t *, xfs_log_item_t *)) - xfs_qm_dqflush_done, &(dqp->q_logitem.qli_item)); - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for too long. - */ - if (XFS_BUF_ISPINNED(bp)) { - xfs_dqtrace_entry(dqp, "DQFLUSH LOG FORCE"); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - } - - if (flags & XFS_QMOPT_DELWRI) { - xfs_bdwrite(mp, bp); - } else if (flags & XFS_QMOPT_ASYNC) { - xfs_bawrite(mp, bp); - } else { - error = xfs_bwrite(mp, bp); - } - xfs_dqtrace_entry(dqp, "DQFLUSH END"); - /* - * dqp is still locked, but caller is free to unlock it now. - */ - return (error); - -} - -/* - * This is the dquot flushing I/O completion routine. It is called - * from interrupt level when the buffer containing the dquot is - * flushed to disk. It is responsible for removing the dquot logitem - * from the AIL if it has not been re-logged, and unlocking the dquot's - * flush lock. This behavior is very similar to that of inodes.. - */ -/*ARGSUSED*/ -STATIC void -xfs_qm_dqflush_done( - xfs_buf_t *bp, - xfs_dq_logitem_t *qip) -{ - xfs_dquot_t *dqp; - SPLDECL(s); - - dqp = qip->qli_dquot; - - /* - * We only want to pull the item from the AIL if its - * location in the log has not changed since we started the flush. - * Thus, we only bother if the dquot's lsn has - * not changed. First we check the lsn outside the lock - * since it's cheaper, and then we recheck while - * holding the lock before removing the dquot from the AIL. - */ - if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - qip->qli_item.li_lsn == qip->qli_flush_lsn) { - - AIL_LOCK(dqp->q_mount, s); - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - if (qip->qli_item.li_lsn == qip->qli_flush_lsn) - xfs_trans_delete_ail(dqp->q_mount, - (xfs_log_item_t*)qip, s); - else - AIL_UNLOCK(dqp->q_mount, s); - } - - /* - * Release the dq's flush lock since we're done with it. - */ - xfs_dqfunlock(dqp); -} - - -int -xfs_qm_dqflock_nowait( - xfs_dquot_t *dqp) -{ - int locked; - - locked = cpsema(&((dqp)->q_flock)); - - /* XXX ifdef these out */ - if (locked) - (dqp)->dq_flags |= XFS_DQ_FLOCKED; - return (locked); -} - - -int -xfs_qm_dqlock_nowait( - xfs_dquot_t *dqp) -{ - return (mutex_trylock(&((dqp)->q_qlock))); -} - -void -xfs_dqlock( - xfs_dquot_t *dqp) -{ - mutex_lock(&(dqp->q_qlock)); -} - -void -xfs_dqunlock( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); - if (dqp->q_logitem.qli_dquot == dqp) { - /* Once was dqp->q_mount, but might just have been cleared */ - xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp, - (xfs_log_item_t*)&(dqp->q_logitem)); - } -} - - -void -xfs_dqunlock_nonotify( - xfs_dquot_t *dqp) -{ - mutex_unlock(&(dqp->q_qlock)); -} - -void -xfs_dqlock2( - xfs_dquot_t *d1, - xfs_dquot_t *d2) -{ - if (d1 && d2) { - ASSERT(d1 != d2); - if (be32_to_cpu(d1->q_core.d_id) > - be32_to_cpu(d2->q_core.d_id)) { - xfs_dqlock(d2); - xfs_dqlock(d1); - } else { - xfs_dqlock(d1); - xfs_dqlock(d2); - } - } else { - if (d1) { - xfs_dqlock(d1); - } else if (d2) { - xfs_dqlock(d2); - } - } -} - - -/* - * Take a dquot out of the mount's dqlist as well as the hashlist. - * This is called via unmount as well as quotaoff, and the purge - * will always succeed unless there are soft (temp) references - * outstanding. - * - * This returns 0 if it was purged, 1 if it wasn't. It's not an error code - * that we're returning! XXXsup - not cool. - */ -/* ARGSUSED */ -int -xfs_qm_dqpurge( - xfs_dquot_t *dqp, - uint flags) -{ - xfs_dqhash_t *thishash; - xfs_mount_t *mp; - - mp = dqp->q_mount; - - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); - - xfs_dqlock(dqp); - /* - * We really can't afford to purge a dquot that is - * referenced, because these are hard refs. - * It shouldn't happen in general because we went thru _all_ inodes in - * dqrele_all_inodes before calling this and didn't let the mountlock go. - * However it is possible that we have dquots with temporary - * references that are not attached to an inode. e.g. see xfs_setattr(). - */ - if (dqp->q_nrefs != 0) { - xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(dqp->q_hash); - return (1); - } - - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); - - /* - * If we're turning off quotas, we have to make sure that, for - * example, we don't delete quota disk blocks while dquots are - * in the process of getting written to those disk blocks. - * This dquot might well be on AIL, and we can't leave it there - * if we're turning off quotas. Basically, we need this flush - * lock, and are willing to block on it. - */ - if (! xfs_qm_dqflock_nowait(dqp)) { - /* - * Block on the flush lock after nudging dquot buffer, - * if it is incore. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - - /* - * XXXIf we're turning this type of quotas off, we don't care - * about the dirty metadata sitting in this dquot. OTOH, if - * we're unmounting, we do care, so we flush it and wait. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); - /* dqflush unlocks dqflock */ - /* - * Given that dqpurge is a very rare occurrence, it is OK - * that we're holding the hashlist and mplist locks - * across the disk write. But, ... XXXsup - * - * We don't care about getting disk errors here. We need - * to purge this dquot anyway, so we go ahead regardless. - */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); - xfs_dqflock(dqp); - } - ASSERT(dqp->q_pincount == 0); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || - !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); - - thishash = dqp->q_hash; - XQM_HASHLIST_REMOVE(thishash, dqp); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(mp)), dqp); - /* - * XXX Move this to the front of the freelist, if we can get the - * freelist lock. - */ - ASSERT(XFS_DQ_IS_ON_FREELIST(dqp)); - - dqp->q_mount = NULL; - dqp->q_hash = NULL; - dqp->dq_flags = XFS_DQ_INACTIVE; - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(thishash); - return (0); -} - - -#ifdef QUOTADEBUG -void -xfs_qm_dqprint(xfs_dquot_t *dqp) -{ - cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); - cmn_err(CE_DEBUG, "---- dquotID = %d", - (int)be32_to_cpu(dqp->q_core.d_id)); - cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); - cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); - cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); - cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); - cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_blk_hardlimit), - (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); - cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_blk_softlimit), - (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); - cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_ino_hardlimit), - (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); - cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_ino_softlimit), - (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); - cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_bcount), - (int)be64_to_cpu(dqp->q_core.d_bcount)); - cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", - be64_to_cpu(dqp->q_core.d_icount), - (int)be64_to_cpu(dqp->q_core.d_icount)); - cmn_err(CE_DEBUG, "---- btimer = %d", - (int)be32_to_cpu(dqp->q_core.d_btimer)); - cmn_err(CE_DEBUG, "---- itimer = %d", - (int)be32_to_cpu(dqp->q_core.d_itimer)); - cmn_err(CE_DEBUG, "---------------------------"); -} -#endif - -/* - * Give the buffer a little push if it is incore and - * wait on the flush lock. - */ -void -xfs_qm_dqflock_pushbuf_wait( - xfs_dquot_t *dqp) -{ - xfs_buf_t *bp; - - /* - * Check to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - bp = xfs_incore(dqp->q_mount->m_ddev_targp, dqp->q_blkno, - XFS_QI_DQCHUNKLEN(dqp->q_mount), - XFS_INCORE_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(dqp->q_mount, - (xfs_lsn_t)0, - XFS_LOG_FORCE); - } - xfs_bawrite(dqp->q_mount, bp); - } else { - xfs_buf_relse(bp); - } - } - xfs_dqflock(dqp); -} diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h deleted file mode 100644 index 78d3ab95c5f..00000000000 --- a/fs/xfs/quota/xfs_dquot.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DQUOT_H__ -#define __XFS_DQUOT_H__ - -/* - * Dquots are structures that hold quota information about a user or a group, - * much like inodes are for files. In fact, dquots share many characteristics - * with inodes. However, dquots can also be a centralized resource, relative - * to a collection of inodes. In this respect, dquots share some characteristics - * of the superblock. - * XFS dquots exploit both those in its algorithms. They make every attempt - * to not be a bottleneck when quotas are on and have minimal impact, if any, - * when quotas are off. - */ - -/* - * The hash chain headers (hash buckets) - */ -typedef struct xfs_dqhash { - struct xfs_dquot *qh_next; - mutex_t qh_lock; - uint qh_version; /* ever increasing version */ - uint qh_nelems; /* number of dquots on the list */ -} xfs_dqhash_t; - -typedef struct xfs_dqlink { - struct xfs_dquot *ql_next; /* forward link */ - struct xfs_dquot **ql_prevp; /* pointer to prev ql_next */ -} xfs_dqlink_t; - -struct xfs_mount; -struct xfs_trans; - -/* - * This is the marker which is designed to occupy the first few - * bytes of the xfs_dquot_t structure. Even inside this, the freelist pointers - * must come first. - * This serves as the marker ("sentinel") when we have to restart list - * iterations because of locking considerations. - */ -typedef struct xfs_dqmarker { - struct xfs_dquot*dqm_flnext; /* link to freelist: must be first */ - struct xfs_dquot*dqm_flprev; - xfs_dqlink_t dqm_mplist; /* link to mount's list of dquots */ - xfs_dqlink_t dqm_hashlist; /* link to the hash chain */ - uint dqm_flags; /* various flags (XFS_DQ_*) */ -} xfs_dqmarker_t; - -/* - * The incore dquot structure - */ -typedef struct xfs_dquot { - xfs_dqmarker_t q_lists; /* list ptrs, q_flags (marker) */ - xfs_dqhash_t *q_hash; /* the hashchain header */ - struct xfs_mount*q_mount; /* filesystem this relates to */ - struct xfs_trans*q_transp; /* trans this belongs to currently */ - uint q_nrefs; /* # active refs from inodes */ - xfs_daddr_t q_blkno; /* blkno of dquot buffer */ - int q_bufoffset; /* off of dq in buffer (# dquots) */ - xfs_fileoff_t q_fileoffset; /* offset in quotas file */ - - struct xfs_dquot*q_gdquot; /* group dquot, hint only */ - xfs_disk_dquot_t q_core; /* actual usage & quotas */ - xfs_dq_logitem_t q_logitem; /* dquot log item */ - xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ - xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ - xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ - mutex_t q_qlock; /* quota lock */ - sema_t q_flock; /* flush lock */ - uint q_pincount; /* pin count for this dquot */ - sv_t q_pinwait; /* sync var for pinning */ -#ifdef XFS_DQUOT_TRACE - struct ktrace *q_trace; /* trace header structure */ -#endif -} xfs_dquot_t; - - -#define dq_flnext q_lists.dqm_flnext -#define dq_flprev q_lists.dqm_flprev -#define dq_mplist q_lists.dqm_mplist -#define dq_hashlist q_lists.dqm_hashlist -#define dq_flags q_lists.dqm_flags - -#define XFS_DQHOLD(dqp) ((dqp)->q_nrefs++) - -#ifdef DEBUG -static inline int -XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp) -{ - if (mutex_trylock(&dqp->q_qlock)) { - mutex_unlock(&dqp->q_qlock); - return 0; - } - return 1; -} -#endif - - -/* - * The following three routines simply manage the q_flock - * semaphore embedded in the dquot. This semaphore synchronizes - * processes attempting to flush the in-core dquot back to disk. - */ -#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\ - (dqp)->dq_flags |= XFS_DQ_FLOCKED; } -#define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \ - vsema(&((dqp)->q_flock)); \ - (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); } - -#define XFS_DQ_PINLOCK(dqp) mutex_spinlock( \ - &(XFS_DQ_TO_QINF(dqp)->qi_pinlock)) -#define XFS_DQ_PINUNLOCK(dqp, s) mutex_spinunlock( \ - &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s) - -#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock))) -#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp)) -#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) -#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) -#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) -#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -#define XFS_DQ_TO_QINF(dqp) ((dqp)->q_mount->m_quotainfo) -#define XFS_DQ_TO_QIP(dqp) (XFS_QM_ISUDQ(dqp) ? \ - XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \ - XFS_DQ_TO_QINF(dqp)->qi_gquotaip) - -#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \ - (XFS_IS_UQUOTA_ON((d)->q_mount)) : \ - (XFS_IS_OQUOTA_ON((d)->q_mount)))) - -#ifdef XFS_DQUOT_TRACE -/* - * Dquot Tracing stuff. - */ -#define DQUOT_TRACE_SIZE 64 -#define DQUOT_KTRACE_ENTRY 1 - -extern void __xfs_dqtrace_entry(xfs_dquot_t *dqp, char *func, - void *, xfs_inode_t *); -#define xfs_dqtrace_entry_ino(a,b,ip) \ - __xfs_dqtrace_entry((a), (b), (void*)__return_address, (ip)) -#define xfs_dqtrace_entry(a,b) \ - __xfs_dqtrace_entry((a), (b), (void*)__return_address, NULL) -#else -#define xfs_dqtrace_entry(a,b) -#define xfs_dqtrace_entry_ino(a,b,ip) -#endif - -#ifdef QUOTADEBUG -extern void xfs_qm_dqprint(xfs_dquot_t *); -#else -#define xfs_qm_dqprint(a) -#endif - -extern void xfs_qm_dqdestroy(xfs_dquot_t *); -extern int xfs_qm_dqflush(xfs_dquot_t *, uint); -extern int xfs_qm_dqpurge(xfs_dquot_t *, uint); -extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); -extern int xfs_qm_dqlock_nowait(xfs_dquot_t *); -extern int xfs_qm_dqflock_nowait(xfs_dquot_t *); -extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp); -extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, - xfs_disk_dquot_t *); -extern void xfs_qm_adjust_dqlimits(xfs_mount_t *, - xfs_disk_dquot_t *); -extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, - xfs_dqid_t, uint, uint, xfs_dquot_t **); -extern void xfs_qm_dqput(xfs_dquot_t *); -extern void xfs_qm_dqrele(xfs_dquot_t *); -extern void xfs_dqlock(xfs_dquot_t *); -extern void xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *); -extern void xfs_dqunlock(xfs_dquot_t *); -extern void xfs_dqunlock_nonotify(xfs_dquot_t *); - -#endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c deleted file mode 100644 index 5b2dcc58b24..00000000000 --- a/fs/xfs/quota/xfs_dquot_item.c +++ /dev/null @@ -1,699 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_trans_priv.h" -#include "xfs_qm.h" - -/* - * returns the number of iovecs needed to log the given dquot item. - */ -/* ARGSUSED */ -STATIC uint -xfs_qm_dquot_logitem_size( - xfs_dq_logitem_t *logitem) -{ - /* - * we need only two iovecs, one for the format, one for the real thing - */ - return (2); -} - -/* - * fills in the vector of log iovecs for the given dquot log item. - */ -STATIC void -xfs_qm_dquot_logitem_format( - xfs_dq_logitem_t *logitem, - xfs_log_iovec_t *logvec) -{ - ASSERT(logitem); - ASSERT(logitem->qli_dquot); - - logvec->i_addr = (xfs_caddr_t)&logitem->qli_format; - logvec->i_len = sizeof(xfs_dq_logformat_t); - XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_QFORMAT); - logvec++; - logvec->i_addr = (xfs_caddr_t)&logitem->qli_dquot->q_core; - logvec->i_len = sizeof(xfs_disk_dquot_t); - XLOG_VEC_SET_TYPE(logvec, XLOG_REG_TYPE_DQUOT); - - ASSERT(2 == logitem->qli_item.li_desc->lid_size); - logitem->qli_format.qlf_size = 2; - -} - -/* - * Increment the pin count of the given dquot. - * This value is protected by pinlock spinlock in the xQM structure. - */ -STATIC void -xfs_qm_dquot_logitem_pin( - xfs_dq_logitem_t *logitem) -{ - unsigned long s; - xfs_dquot_t *dqp; - - dqp = logitem->qli_dquot; - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - s = XFS_DQ_PINLOCK(dqp); - dqp->q_pincount++; - XFS_DQ_PINUNLOCK(dqp, s); -} - -/* - * Decrement the pin count of the given dquot, and wake up - * anyone in xfs_dqwait_unpin() if the count goes to 0. The - * dquot must have been previously pinned with a call to xfs_dqpin(). - */ -/* ARGSUSED */ -STATIC void -xfs_qm_dquot_logitem_unpin( - xfs_dq_logitem_t *logitem, - int stale) -{ - unsigned long s; - xfs_dquot_t *dqp; - - dqp = logitem->qli_dquot; - ASSERT(dqp->q_pincount > 0); - s = XFS_DQ_PINLOCK(dqp); - dqp->q_pincount--; - if (dqp->q_pincount == 0) { - sv_broadcast(&dqp->q_pinwait); - } - XFS_DQ_PINUNLOCK(dqp, s); -} - -/* ARGSUSED */ -STATIC void -xfs_qm_dquot_logitem_unpin_remove( - xfs_dq_logitem_t *logitem, - xfs_trans_t *tp) -{ - xfs_qm_dquot_logitem_unpin(logitem, 0); -} - -/* - * Given the logitem, this writes the corresponding dquot entry to disk - * asynchronously. This is called with the dquot entry securely locked; - * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot - * at the end. - */ -STATIC void -xfs_qm_dquot_logitem_push( - xfs_dq_logitem_t *logitem) -{ - xfs_dquot_t *dqp; - - dqp = logitem->qli_dquot; - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp)); - - /* - * Since we were able to lock the dquot's flush lock and - * we found it on the AIL, the dquot must be dirty. This - * is because the dquot is removed from the AIL while still - * holding the flush lock in xfs_dqflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the dquot. - */ - xfs_qm_dqflush(dqp, XFS_B_DELWRI); - xfs_dqunlock(dqp); -} - -/*ARGSUSED*/ -STATIC xfs_lsn_t -xfs_qm_dquot_logitem_committed( - xfs_dq_logitem_t *l, - xfs_lsn_t lsn) -{ - /* - * We always re-log the entire dquot when it becomes dirty, - * so, the latest copy _is_ the only one that matters. - */ - return (lsn); -} - - -/* - * This is called to wait for the given dquot to be unpinned. - * Most of these pin/unpin routines are plagiarized from inode code. - */ -void -xfs_qm_dqunpin_wait( - xfs_dquot_t *dqp) -{ - SPLDECL(s); - - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (dqp->q_pincount == 0) { - return; - } - - /* - * Give the log a push so we don't wait here too long. - */ - xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE); - s = XFS_DQ_PINLOCK(dqp); - if (dqp->q_pincount == 0) { - XFS_DQ_PINUNLOCK(dqp, s); - return; - } - sv_wait(&(dqp->q_pinwait), PINOD, - &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s); -} - -/* - * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that - * the dquot is locked by us, but the flush lock isn't. So, here we are - * going to see if the relevant dquot buffer is incore, waiting on DELWRI. - * If so, we want to push it out to help us take this item off the AIL as soon - * as possible. - * - * We must not be holding the AIL_LOCK at this point. Calling incore() to - * search the buffer cache can be a time consuming thing, and AIL_LOCK is a - * spinlock. - */ -STATIC void -xfs_qm_dquot_logitem_pushbuf( - xfs_dq_logitem_t *qip) -{ - xfs_dquot_t *dqp; - xfs_mount_t *mp; - xfs_buf_t *bp; - uint dopush; - - dqp = qip->qli_dquot; - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - /* - * The qli_pushbuf_flag keeps others from - * trying to duplicate our effort. - */ - ASSERT(qip->qli_pushbuf_flag != 0); - ASSERT(qip->qli_push_owner == current_pid()); - - /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. - */ - if (!issemalocked(&(dqp->q_flock)) || - ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - return; - } - mp = dqp->q_mount; - bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, - XFS_QI_DQCHUNKLEN(mp), - XFS_INCORE_TRYLOCK); - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) && - issemalocked(&(dqp->q_flock))); - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE); - } - if (dopush) { -#ifdef XFSRACEDEBUG - delay_for_intr(); - delay(300); -#endif - xfs_bawrite(mp, bp); - } else { - xfs_buf_relse(bp); - } - } else { - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); - xfs_buf_relse(bp); - } - return; - } - - qip->qli_pushbuf_flag = 0; - xfs_dqunlock(dqp); -} - -/* - * This is called to attempt to lock the dquot associated with this - * dquot log item. Don't sleep on the dquot lock or the flush lock. - * If the flush lock is already held, indicating that the dquot has - * been or is in the process of being flushed, then see if we can - * find the dquot's buffer in the buffer cache without sleeping. If - * we can and it is marked delayed write, then we want to send it out. - * We delay doing so until the push routine, though, to avoid sleeping - * in any device strategy routines. - */ -STATIC uint -xfs_qm_dquot_logitem_trylock( - xfs_dq_logitem_t *qip) -{ - xfs_dquot_t *dqp; - uint retval; - - dqp = qip->qli_dquot; - if (dqp->q_pincount > 0) - return (XFS_ITEM_PINNED); - - if (! xfs_qm_dqlock_nowait(dqp)) - return (XFS_ITEM_LOCKED); - - retval = XFS_ITEM_SUCCESS; - if (! xfs_qm_dqflock_nowait(dqp)) { - /* - * The dquot is already being flushed. It may have been - * flushed delayed write, however, and we don't want to - * get stuck waiting for that to complete. So, we want to check - * to see if we can lock the dquot's buffer without sleeping. - * If we can and it is marked for delayed write, then we - * hold it and send it out from the push routine. We don't - * want to do that now since we might sleep in the device - * strategy routine. We also don't want to grab the buffer lock - * here because we'd like not to call into the buffer cache - * while holding the AIL_LOCK. - * Make sure to only return PUSHBUF if we set pushbuf_flag - * ourselves. If someone else is doing it then we don't - * want to go to the push routine and duplicate their efforts. - */ - if (qip->qli_pushbuf_flag == 0) { - qip->qli_pushbuf_flag = 1; - ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno); -#ifdef DEBUG - qip->qli_push_owner = current_pid(); -#endif - /* - * The dquot is left locked. - */ - retval = XFS_ITEM_PUSHBUF; - } else { - retval = XFS_ITEM_FLUSHING; - xfs_dqunlock_nonotify(dqp); - } - } - - ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); - return (retval); -} - - -/* - * Unlock the dquot associated with the log item. - * Clear the fields of the dquot and dquot log item that - * are specific to the current transaction. If the - * hold flags is set, do not unlock the dquot. - */ -STATIC void -xfs_qm_dquot_logitem_unlock( - xfs_dq_logitem_t *ql) -{ - xfs_dquot_t *dqp; - - ASSERT(ql != NULL); - dqp = ql->qli_dquot; - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - - /* - * Clear the transaction pointer in the dquot - */ - dqp->q_transp = NULL; - - /* - * dquots are never 'held' from getting unlocked at the end of - * a transaction. Their locking and unlocking is hidden inside the - * transaction layer, within trans_commit. Hence, no LI_HOLD flag - * for the logitem. - */ - xfs_dqunlock(dqp); -} - - -/* - * The transaction with the dquot locked has aborted. The dquot - * must not be dirty within the transaction. We simply unlock just - * as if the transaction had been cancelled. - */ -STATIC void -xfs_qm_dquot_logitem_abort( - xfs_dq_logitem_t *ql) -{ - xfs_qm_dquot_logitem_unlock(ql); -} - -/* - * this needs to stamp an lsn into the dquot, I think. - * rpc's that look at user dquot's would then have to - * push on the dependency recorded in the dquot - */ -/* ARGSUSED */ -STATIC void -xfs_qm_dquot_logitem_committing( - xfs_dq_logitem_t *l, - xfs_lsn_t lsn) -{ - return; -} - - -/* - * This is the ops vector for dquots - */ -STATIC struct xfs_item_ops xfs_dquot_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_dquot_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_dquot_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_qm_dquot_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*)) - xfs_qm_dquot_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_dquot_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_dquot_logitem_abort, - .iop_pushbuf = (void(*)(xfs_log_item_t*)) - xfs_qm_dquot_logitem_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_dquot_logitem_committing -}; - -/* - * Initialize the dquot log item for a newly allocated dquot. - * The dquot isn't locked at this point, but it isn't on any of the lists - * either, so we don't care. - */ -void -xfs_qm_dquot_logitem_init( - struct xfs_dquot *dqp) -{ - xfs_dq_logitem_t *lp; - lp = &dqp->q_logitem; - - lp->qli_item.li_type = XFS_LI_DQUOT; - lp->qli_item.li_ops = &xfs_dquot_item_ops; - lp->qli_item.li_mountp = dqp->q_mount; - lp->qli_dquot = dqp; - lp->qli_format.qlf_type = XFS_LI_DQUOT; - lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id); - lp->qli_format.qlf_blkno = dqp->q_blkno; - lp->qli_format.qlf_len = 1; - /* - * This is just the offset of this dquot within its buffer - * (which is currently 1 FSB and probably won't change). - * Hence 32 bits for this offset should be just fine. - * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t)) - * here, and recompute it at recovery time. - */ - lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset; -} - -/*------------------ QUOTAOFF LOG ITEMS -------------------*/ - -/* - * This returns the number of iovecs needed to log the given quotaoff item. - * We only need 1 iovec for an quotaoff item. It just logs the - * quotaoff_log_format structure. - */ -/*ARGSUSED*/ -STATIC uint -xfs_qm_qoff_logitem_size(xfs_qoff_logitem_t *qf) -{ - return (1); -} - -/* - * This is called to fill in the vector of log iovecs for the - * given quotaoff log item. We use only 1 iovec, and we point that - * at the quotaoff_log_format structure embedded in the quotaoff item. - * It is at this point that we assert that all of the extent - * slots in the quotaoff item have been filled. - */ -STATIC void -xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t *qf, - xfs_log_iovec_t *log_vector) -{ - ASSERT(qf->qql_format.qf_type == XFS_LI_QUOTAOFF); - - log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format); - log_vector->i_len = sizeof(xfs_qoff_logitem_t); - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF); - qf->qql_format.qf_size = 1; -} - - -/* - * Pinning has no meaning for an quotaoff item, so just return. - */ -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_pin(xfs_qoff_logitem_t *qf) -{ - return; -} - - -/* - * Since pinning has no meaning for an quotaoff item, unpinning does - * not either. - */ -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_unpin(xfs_qoff_logitem_t *qf, int stale) -{ - return; -} - -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_unpin_remove(xfs_qoff_logitem_t *qf, xfs_trans_t *tp) -{ - return; -} - -/* - * Quotaoff items have no locking, so just return success. - */ -/*ARGSUSED*/ -STATIC uint -xfs_qm_qoff_logitem_trylock(xfs_qoff_logitem_t *qf) -{ - return XFS_ITEM_LOCKED; -} - -/* - * Quotaoff items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_unlock(xfs_qoff_logitem_t *qf) -{ - return; -} - -/* - * The quotaoff-start-item is logged only once and cannot be moved in the log, - * so simply return the lsn at which it's been logged. - */ -/*ARGSUSED*/ -STATIC xfs_lsn_t -xfs_qm_qoff_logitem_committed(xfs_qoff_logitem_t *qf, xfs_lsn_t lsn) -{ - return (lsn); -} - -/* - * The transaction of which this QUOTAOFF is a part has been aborted. - * Just clean up after ourselves. - * Shouldn't this never happen in the case of qoffend logitems? XXX - */ -STATIC void -xfs_qm_qoff_logitem_abort(xfs_qoff_logitem_t *qf) -{ - kmem_free(qf, sizeof(xfs_qoff_logitem_t)); -} - -/* - * There isn't much you can do to push on an quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -/*ARGSUSED*/ -STATIC void -xfs_qm_qoff_logitem_push(xfs_qoff_logitem_t *qf) -{ - return; -} - - -/*ARGSUSED*/ -STATIC xfs_lsn_t -xfs_qm_qoffend_logitem_committed( - xfs_qoff_logitem_t *qfe, - xfs_lsn_t lsn) -{ - xfs_qoff_logitem_t *qfs; - SPLDECL(s); - - qfs = qfe->qql_start_lip; - AIL_LOCK(qfs->qql_item.li_mountp,s); - /* - * Delete the qoff-start logitem from the AIL. - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs, s); - kmem_free(qfs, sizeof(xfs_qoff_logitem_t)); - kmem_free(qfe, sizeof(xfs_qoff_logitem_t)); - return (xfs_lsn_t)-1; -} - -/* - * XXX rcc - don't know quite what to do with this. I think we can - * just ignore it. The only time that isn't the case is if we allow - * the client to somehow see that quotas have been turned off in which - * we can't allow that to get back until the quotaoff hits the disk. - * So how would that happen? Also, do we need different routines for - * quotaoff start and quotaoff end? I suspect the answer is yes but - * to be sure, I need to look at the recovery code and see how quota off - * recovery is handled (do we roll forward or back or do something else). - * If we roll forwards or backwards, then we need two separate routines, - * one that does nothing and one that stamps in the lsn that matters - * (truly makes the quotaoff irrevocable). If we do something else, - * then maybe we don't need two. - */ -/* ARGSUSED */ -STATIC void -xfs_qm_qoff_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) -{ - return; -} - -/* ARGSUSED */ -STATIC void -xfs_qm_qoffend_logitem_committing(xfs_qoff_logitem_t *qip, xfs_lsn_t commit_lsn) -{ - return; -} - -STATIC struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_qoff_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t* ,int)) - xfs_qm_qoff_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) - xfs_qm_qoff_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoffend_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoffend_logitem_committing -}; - -/* - * This is the ops vector shared by all quotaoff-start log items. - */ -STATIC struct xfs_item_ops xfs_qm_qoff_logitem_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_qm_qoff_logitem_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int)) - xfs_qm_qoff_logitem_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*,xfs_trans_t*)) - xfs_qm_qoff_logitem_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoff_logitem_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_qm_qoff_logitem_abort, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_qm_qoff_logitem_committing -}; - -/* - * Allocate and initialize an quotaoff item of the correct quota type(s). - */ -xfs_qoff_logitem_t * -xfs_qm_qoff_logitem_init( - struct xfs_mount *mp, - xfs_qoff_logitem_t *start, - uint flags) -{ - xfs_qoff_logitem_t *qf; - - qf = (xfs_qoff_logitem_t*) kmem_zalloc(sizeof(xfs_qoff_logitem_t), KM_SLEEP); - - qf->qql_item.li_type = XFS_LI_QUOTAOFF; - if (start) - qf->qql_item.li_ops = &xfs_qm_qoffend_logitem_ops; - else - qf->qql_item.li_ops = &xfs_qm_qoff_logitem_ops; - qf->qql_item.li_mountp = mp; - qf->qql_format.qf_type = XFS_LI_QUOTAOFF; - qf->qql_format.qf_flags = flags; - qf->qql_start_lip = start; - return (qf); -} diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c deleted file mode 100644 index e23e45535c4..00000000000 --- a/fs/xfs/quota/xfs_qm.c +++ /dev/null @@ -1,2856 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_clnt.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_bmap.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_trans_space.h" -#include "xfs_utils.h" -#include "xfs_qm.h" - -/* - * The global quota manager. There is only one of these for the entire - * system, _not_ one per file system. XQM keeps track of the overall - * quota functionality, including maintaining the freelist and hash - * tables of dquots. - */ -mutex_t xfs_Gqm_lock; -struct xfs_qm *xfs_Gqm; -uint ndquot; - -kmem_zone_t *qm_dqzone; -kmem_zone_t *qm_dqtrxzone; -STATIC kmem_shaker_t xfs_qm_shaker; - -STATIC cred_t xfs_zerocr; -STATIC xfs_inode_t xfs_zeroino; - -STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); -STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); - -STATIC void xfs_qm_freelist_init(xfs_frlist_t *); -STATIC void xfs_qm_freelist_destroy(xfs_frlist_t *); -STATIC int xfs_qm_mplist_nowait(xfs_mount_t *); -STATIC int xfs_qm_dqhashlock_nowait(xfs_dquot_t *); - -STATIC int xfs_qm_init_quotainos(xfs_mount_t *); -STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); - -#ifdef DEBUG -extern mutex_t qcheck_lock; -#endif - -#ifdef QUOTADEBUG -#define XQM_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dquot_t *dqp; int i = 0; \ - cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (l)->qh_next; dqp != NULL; dqp = dqp->NXT) { \ - cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " \ - "bcnt = %d, icnt = %d, refs = %d", \ - ++i, (int) be32_to_cpu(dqp->q_core.d_id), \ - DQFLAGTO_TYPESTR(dqp), \ - (int) be64_to_cpu(dqp->q_core.d_bcount), \ - (int) be64_to_cpu(dqp->q_core.d_icount), \ - (int) dqp->q_nrefs); } \ -} -#else -#define XQM_LIST_PRINT(l, NXT, title) do { } while (0) -#endif - -/* - * Initialize the XQM structure. - * Note that there is not one quota manager per file system. - */ -STATIC struct xfs_qm * -xfs_Gqm_init(void) -{ - xfs_dqhash_t *udqhash, *gdqhash; - xfs_qm_t *xqm; - uint i, hsize, flags = KM_SLEEP | KM_MAYFAIL; - - /* - * Initialize the dquot hash tables. - */ - hsize = XFS_QM_HASHSIZE_HIGH; - while (!(udqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), flags))) { - if ((hsize >>= 1) <= XFS_QM_HASHSIZE_LOW) - flags = KM_SLEEP; - } - gdqhash = kmem_zalloc(hsize * sizeof(xfs_dqhash_t), KM_SLEEP); - ndquot = hsize << 8; - - xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP); - xqm->qm_dqhashmask = hsize - 1; - xqm->qm_usr_dqhtable = udqhash; - xqm->qm_grp_dqhtable = gdqhash; - ASSERT(xqm->qm_usr_dqhtable != NULL); - ASSERT(xqm->qm_grp_dqhtable != NULL); - - for (i = 0; i < hsize; i++) { - xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i); - xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i); - } - - /* - * Freelist of all dquots of all file systems - */ - xfs_qm_freelist_init(&(xqm->qm_dqfreelist)); - - /* - * dquot zone. we register our own low-memory callback. - */ - if (!qm_dqzone) { - xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t), - "xfs_dquots"); - qm_dqzone = xqm->qm_dqzone; - } else - xqm->qm_dqzone = qm_dqzone; - - xfs_qm_shaker = kmem_shake_register(xfs_qm_shake); - - /* - * The t_dqinfo portion of transactions. - */ - if (!qm_dqtrxzone) { - xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t), - "xfs_dqtrx"); - qm_dqtrxzone = xqm->qm_dqtrxzone; - } else - xqm->qm_dqtrxzone = qm_dqtrxzone; - - atomic_set(&xqm->qm_totaldquots, 0); - xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO; - xqm->qm_nrefs = 0; -#ifdef DEBUG - mutex_init(&qcheck_lock); -#endif - return xqm; -} - -/* - * Destroy the global quota manager when its reference count goes to zero. - */ -STATIC void -xfs_qm_destroy( - struct xfs_qm *xqm) -{ - int hsize, i; - - ASSERT(xqm != NULL); - ASSERT(xqm->qm_nrefs == 0); - kmem_shake_deregister(xfs_qm_shaker); - hsize = xqm->qm_dqhashmask + 1; - for (i = 0; i < hsize; i++) { - xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i])); - xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i])); - } - kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t)); - kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t)); - xqm->qm_usr_dqhtable = NULL; - xqm->qm_grp_dqhtable = NULL; - xqm->qm_dqhashmask = 0; - xfs_qm_freelist_destroy(&(xqm->qm_dqfreelist)); -#ifdef DEBUG - mutex_destroy(&qcheck_lock); -#endif - kmem_free(xqm, sizeof(xfs_qm_t)); -} - -/* - * Called at mount time to let XQM know that another file system is - * starting quotas. This isn't crucial information as the individual mount - * structures are pretty independent, but it helps the XQM keep a - * global view of what's going on. - */ -/* ARGSUSED */ -STATIC int -xfs_qm_hold_quotafs_ref( - struct xfs_mount *mp) -{ - /* - * Need to lock the xfs_Gqm structure for things like this. For example, - * the structure could disappear between the entry to this routine and - * a HOLD operation if not locked. - */ - XFS_QM_LOCK(xfs_Gqm); - - if (xfs_Gqm == NULL) - xfs_Gqm = xfs_Gqm_init(); - /* - * We can keep a list of all filesystems with quotas mounted for - * debugging and statistical purposes, but ... - * Just take a reference and get out. - */ - XFS_QM_HOLD(xfs_Gqm); - XFS_QM_UNLOCK(xfs_Gqm); - - return 0; -} - - -/* - * Release the reference that a filesystem took at mount time, - * so that we know when we need to destroy the entire quota manager. - */ -/* ARGSUSED */ -STATIC void -xfs_qm_rele_quotafs_ref( - struct xfs_mount *mp) -{ - xfs_dquot_t *dqp, *nextdqp; - - ASSERT(xfs_Gqm); - ASSERT(xfs_Gqm->qm_nrefs > 0); - - /* - * Go thru the freelist and destroy all inactive dquots. - */ - xfs_qm_freelist_lock(xfs_Gqm); - - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - dqp != (xfs_dquot_t *)&(xfs_Gqm->qm_dqfreelist); ) { - xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - } else { - xfs_dqunlock(dqp); - } - dqp = nextdqp; - } - xfs_qm_freelist_unlock(xfs_Gqm); - - /* - * Destroy the entire XQM. If somebody mounts with quotaon, this'll - * be restarted. - */ - XFS_QM_LOCK(xfs_Gqm); - XFS_QM_RELE(xfs_Gqm); - if (xfs_Gqm->qm_nrefs == 0) { - xfs_qm_destroy(xfs_Gqm); - xfs_Gqm = NULL; - } - XFS_QM_UNLOCK(xfs_Gqm); -} - -/* - * This is called at mount time from xfs_mountfs to initialize the quotainfo - * structure and start the global quota manager (xfs_Gqm) if it hasn't done - * so already. Note that the superblock has not been read in yet. - */ -void -xfs_qm_mount_quotainit( - xfs_mount_t *mp, - uint flags) -{ - /* - * User, projects or group quotas has to be on. - */ - ASSERT(flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)); - - /* - * Initialize the flags in the mount structure. From this point - * onwards we look at m_qflags to figure out if quotas's ON/OFF, etc. - * Note that we enforce nothing if accounting is off. - * ie. XFSMNT_*QUOTA must be ON for XFSMNT_*QUOTAENF. - * It isn't necessary to take the quotaoff lock to do this; this is - * called from mount. - */ - if (flags & XFSMNT_UQUOTA) { - mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); - if (flags & XFSMNT_UQUOTAENF) - mp->m_qflags |= XFS_UQUOTA_ENFD; - } - if (flags & XFSMNT_GQUOTA) { - mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); - if (flags & XFSMNT_GQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } else if (flags & XFSMNT_PQUOTA) { - mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); - if (flags & XFSMNT_PQUOTAENF) - mp->m_qflags |= XFS_OQUOTA_ENFD; - } -} - -/* - * Just destroy the quotainfo structure. - */ -void -xfs_qm_unmount_quotadestroy( - xfs_mount_t *mp) -{ - if (mp->m_quotainfo) - xfs_qm_destroy_quotainfo(mp); -} - - -/* - * This is called from xfs_mountfs to start quotas and initialize all - * necessary data structures like quotainfo. This is also responsible for - * running a quotacheck as necessary. We are guaranteed that the superblock - * is consistently read in at this point. - */ -int -xfs_qm_mount_quotas( - xfs_mount_t *mp, - int mfsi_flags) -{ - unsigned long s; - int error = 0; - uint sbf; - - - /* - * If quotas on realtime volumes is not supported, we disable - * quotas immediately. - */ - if (mp->m_sb.sb_rextents) { - cmn_err(CE_NOTE, - "Cannot turn on quotas for realtime filesystem %s", - mp->m_fsname); - mp->m_qflags = 0; - goto write_changes; - } - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - /* - * Allocate the quotainfo structure inside the mount struct, and - * create quotainode(s), and change/rev superblock if necessary. - */ - if ((error = xfs_qm_init_quotainfo(mp))) { - /* - * We must turn off quotas. - */ - ASSERT(mp->m_quotainfo == NULL); - mp->m_qflags = 0; - goto write_changes; - } - /* - * If any of the quotas are not consistent, do a quotacheck. - */ - if (XFS_QM_NEED_QUOTACHECK(mp) && - !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { - if ((error = xfs_qm_quotacheck(mp))) { - /* Quotacheck has failed and quotas have - * been disabled. - */ - return XFS_ERROR(error); - } - } - - write_changes: - /* - * We actually don't have to acquire the SB_LOCK at all. - * This can only be called from mount, and that's single threaded. XXX - */ - s = XFS_SB_LOCK(mp); - sbf = mp->m_sb.sb_qflags; - mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; - XFS_SB_UNLOCK(mp, s); - - if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { - if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { - /* - * We could only have been turning quotas off. - * We aren't in very good shape actually because - * the incore structures are convinced that quotas are - * off, but the on disk superblock doesn't know that ! - */ - ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); - xfs_fs_cmn_err(CE_ALERT, mp, - "XFS mount_quotas: Superblock update failed!"); - } - } - - if (error) { - xfs_fs_cmn_err(CE_WARN, mp, - "Failed to initialize disk quotas."); - } - return XFS_ERROR(error); -} - -/* - * Called from the vfsops layer. - */ -int -xfs_qm_unmount_quotas( - xfs_mount_t *mp) -{ - xfs_inode_t *uqp, *gqp; - int error = 0; - - /* - * Release the dquots that root inode, et al might be holding, - * before we flush quotas and blow away the quotainfo structure. - */ - ASSERT(mp->m_rootip); - xfs_qm_dqdetach(mp->m_rootip); - if (mp->m_rbmip) - xfs_qm_dqdetach(mp->m_rbmip); - if (mp->m_rsumip) - xfs_qm_dqdetach(mp->m_rsumip); - - /* - * Flush out the quota inodes. - */ - uqp = gqp = NULL; - if (mp->m_quotainfo) { - if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) { - xfs_ilock(uqp, XFS_ILOCK_EXCL); - xfs_iflock(uqp); - error = xfs_iflush(uqp, XFS_IFLUSH_SYNC); - xfs_iunlock(uqp, XFS_ILOCK_EXCL); - if (unlikely(error == EFSCORRUPTED)) { - XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)", - XFS_ERRLEVEL_LOW, mp); - goto out; - } - } - if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) { - xfs_ilock(gqp, XFS_ILOCK_EXCL); - xfs_iflock(gqp); - error = xfs_iflush(gqp, XFS_IFLUSH_SYNC); - xfs_iunlock(gqp, XFS_ILOCK_EXCL); - if (unlikely(error == EFSCORRUPTED)) { - XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)", - XFS_ERRLEVEL_LOW, mp); - goto out; - } - } - } - if (uqp) { - XFS_PURGE_INODE(uqp); - mp->m_quotainfo->qi_uquotaip = NULL; - } - if (gqp) { - XFS_PURGE_INODE(gqp); - mp->m_quotainfo->qi_gquotaip = NULL; - } -out: - return XFS_ERROR(error); -} - -/* - * Flush all dquots of the given file system to disk. The dquots are - * _not_ purged from memory here, just their data written to disk. - */ -STATIC int -xfs_qm_dqflush_all( - xfs_mount_t *mp, - int flags) -{ - int recl; - xfs_dquot_t *dqp; - int niters; - int error; - - if (mp->m_quotainfo == NULL) - return 0; - niters = 0; -again: - xfs_qm_mplist_lock(mp); - FOREACH_DQUOT_IN_MP(dqp, mp) { - xfs_dqlock(dqp); - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY"); - /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); - if (! xfs_qm_dqflock_nowait(dqp)) { - /* - * If we can't grab the flush lock then check - * to see if the dquot has been flushed delayed - * write. If so, grab its buffer and send it - * out immediately. We'll be able to acquire - * the flush lock when the I/O completes. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write. - */ - xfs_qm_mplist_unlock(mp); - error = xfs_qm_dqflush(dqp, flags); - xfs_dqunlock(dqp); - if (error) - return error; - - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { - xfs_qm_mplist_unlock(mp); - /* XXX restart limit */ - goto again; - } - } - - xfs_qm_mplist_unlock(mp); - /* return ! busy */ - return 0; -} -/* - * Release the group dquot pointers the user dquots may be - * carrying around as a hint. mplist is locked on entry and exit. - */ -STATIC void -xfs_qm_detach_gdquots( - xfs_mount_t *mp) -{ - xfs_dquot_t *dqp, *gdqp; - int nrecl; - - again: - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { - xfs_dqlock(dqp); - if ((gdqp = dqp->q_gdquot)) { - xfs_dqlock(gdqp); - dqp->q_gdquot = NULL; - } - xfs_dqunlock(dqp); - - if (gdqp) { - /* - * Can't hold the mplist lock across a dqput. - * XXXmust convert to marker based iterations here. - */ - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); - xfs_qm_dqput(gdqp); - - xfs_qm_mplist_lock(mp); - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) - goto again; - } - dqp = dqp->MPL_NEXT; - } -} - -/* - * Go through all the incore dquots of this file system and take them - * off the mplist and hashlist, if the dquot type matches the dqtype - * parameter. This is used when turning off quota accounting for - * users and/or groups, as well as when the filesystem is unmounting. - */ -STATIC int -xfs_qm_dqpurge_int( - xfs_mount_t *mp, - uint flags) /* QUOTAOFF/UMOUNTING/UQUOTA/PQUOTA/GQUOTA */ -{ - xfs_dquot_t *dqp; - uint dqtype; - int nrecl; - xfs_dquot_t *nextdqp; - int nmisses; - - if (mp->m_quotainfo == NULL) - return 0; - - dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0; - dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0; - dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0; - - xfs_qm_mplist_lock(mp); - - /* - * In the first pass through all incore dquots of this filesystem, - * we release the group dquot pointers the user dquots may be - * carrying around as a hint. We need to do this irrespective of - * what's being turned off. - */ - xfs_qm_detach_gdquots(mp); - - again: - nmisses = 0; - ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); - /* - * Try to get rid of all of the unwanted dquots. The idea is to - * get them off mplist and hashlist, but leave them on freelist. - */ - dqp = XFS_QI_MPLNEXT(mp); - while (dqp) { - /* - * It's OK to look at the type without taking dqlock here. - * We're holding the mplist lock here, and that's needed for - * a dqreclaim. - */ - if ((dqp->dq_flags & dqtype) == 0) { - dqp = dqp->MPL_NEXT; - continue; - } - - if (! xfs_qm_dqhashlock_nowait(dqp)) { - nrecl = XFS_QI_MPLRECLAIMS(mp); - xfs_qm_mplist_unlock(mp); - XFS_DQ_HASH_LOCK(dqp->q_hash); - xfs_qm_mplist_lock(mp); - - /* - * XXXTheoretically, we can get into a very long - * ping pong game here. - * No one can be adding dquots to the mplist at - * this point, but somebody might be taking things off. - */ - if (nrecl != XFS_QI_MPLRECLAIMS(mp)) { - XFS_DQ_HASH_UNLOCK(dqp->q_hash); - goto again; - } - } - - /* - * Take the dquot off the mplist and hashlist. It may remain on - * freelist in INACTIVE state. - */ - nextdqp = dqp->MPL_NEXT; - nmisses += xfs_qm_dqpurge(dqp, flags); - dqp = nextdqp; - } - xfs_qm_mplist_unlock(mp); - return nmisses; -} - -int -xfs_qm_dqpurge_all( - xfs_mount_t *mp, - uint flags) -{ - int ndquots; - - /* - * Purge the dquot cache. - * None of the dquots should really be busy at this point. - */ - if (mp->m_quotainfo) { - while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) { - delay(ndquots * 10); - } - } - return 0; -} - -STATIC int -xfs_qm_dqattach_one( - xfs_inode_t *ip, - xfs_dqid_t id, - uint type, - uint doalloc, - uint dolock, - xfs_dquot_t *udqhint, /* hint */ - xfs_dquot_t **IO_idqpp) -{ - xfs_dquot_t *dqp; - int error; - - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - error = 0; - /* - * See if we already have it in the inode itself. IO_idqpp is - * &i_udquot or &i_gdquot. This made the code look weird, but - * made the logic a lot simpler. - */ - if ((dqp = *IO_idqpp)) { - if (dolock) - xfs_dqlock(dqp); - xfs_dqtrace_entry(dqp, "DQATTACH: found in ip"); - goto done; - } - - /* - * udqhint is the i_udquot field in inode, and is non-NULL only - * when the type arg is group/project. Its purpose is to save a - * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside - * the user dquot. - */ - ASSERT(!udqhint || type == XFS_DQ_GROUP || type == XFS_DQ_PROJ); - if (udqhint && !dolock) - xfs_dqlock(udqhint); - - /* - * No need to take dqlock to look at the id. - * The ID can't change until it gets reclaimed, and it won't - * be reclaimed as long as we have a ref from inode and we hold - * the ilock. - */ - if (udqhint && - (dqp = udqhint->q_gdquot) && - (be32_to_cpu(dqp->q_core.d_id) == id)) { - ASSERT(XFS_DQ_IS_LOCKED(udqhint)); - xfs_dqlock(dqp); - XFS_DQHOLD(dqp); - ASSERT(*IO_idqpp == NULL); - *IO_idqpp = dqp; - if (!dolock) { - xfs_dqunlock(dqp); - xfs_dqunlock(udqhint); - } - goto done; - } - /* - * We can't hold a dquot lock when we call the dqget code. - * We'll deadlock in no time, because of (not conforming to) - * lock ordering - the inodelock comes before any dquot lock, - * and we may drop and reacquire the ilock in xfs_qm_dqget(). - */ - if (udqhint) - xfs_dqunlock(udqhint); - /* - * Find the dquot from somewhere. This bumps the - * reference count of dquot and returns it locked. - * This can return ENOENT if dquot didn't exist on - * disk and we didn't ask it to allocate; - * ESRCH if quotas got turned off suddenly. - */ - if ((error = xfs_qm_dqget(ip->i_mount, ip, id, type, - doalloc|XFS_QMOPT_DOWARN, &dqp))) { - if (udqhint && dolock) - xfs_dqlock(udqhint); - goto done; - } - - xfs_dqtrace_entry(dqp, "DQATTACH: found by dqget"); - /* - * dqget may have dropped and re-acquired the ilock, but it guarantees - * that the dquot returned is the one that should go in the inode. - */ - *IO_idqpp = dqp; - ASSERT(dqp); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (! dolock) { - xfs_dqunlock(dqp); - goto done; - } - if (! udqhint) - goto done; - - ASSERT(udqhint); - ASSERT(dolock); - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - if (! xfs_qm_dqlock_nowait(udqhint)) { - xfs_dqunlock(dqp); - xfs_dqlock(udqhint); - xfs_dqlock(dqp); - } - done: -#ifdef QUOTADEBUG - if (udqhint) { - if (dolock) - ASSERT(XFS_DQ_IS_LOCKED(udqhint)); - } - if (! error) { - if (dolock) - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - } -#endif - return error; -} - - -/* - * Given a udquot and gdquot, attach a ptr to the group dquot in the - * udquot as a hint for future lookups. The idea sounds simple, but the - * execution isn't, because the udquot might have a group dquot attached - * already and getting rid of that gets us into lock ordering constraints. - * The process is complicated more by the fact that the dquots may or may not - * be locked on entry. - */ -STATIC void -xfs_qm_dqattach_grouphint( - xfs_dquot_t *udq, - xfs_dquot_t *gdq, - uint locked) -{ - xfs_dquot_t *tmp; - -#ifdef QUOTADEBUG - if (locked) { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - } -#endif - if (! locked) - xfs_dqlock(udq); - - if ((tmp = udq->q_gdquot)) { - if (tmp == gdq) { - if (! locked) - xfs_dqunlock(udq); - return; - } - - udq->q_gdquot = NULL; - /* - * We can't keep any dqlocks when calling dqrele, - * because the freelist lock comes before dqlocks. - */ - xfs_dqunlock(udq); - if (locked) - xfs_dqunlock(gdq); - /* - * we took a hard reference once upon a time in dqget, - * so give it back when the udquot no longer points at it - * dqput() does the unlocking of the dquot. - */ - xfs_qm_dqrele(tmp); - - xfs_dqlock(udq); - xfs_dqlock(gdq); - - } else { - ASSERT(XFS_DQ_IS_LOCKED(udq)); - if (! locked) { - xfs_dqlock(gdq); - } - } - - ASSERT(XFS_DQ_IS_LOCKED(udq)); - ASSERT(XFS_DQ_IS_LOCKED(gdq)); - /* - * Somebody could have attached a gdquot here, - * when we dropped the uqlock. If so, just do nothing. - */ - if (udq->q_gdquot == NULL) { - XFS_DQHOLD(gdq); - udq->q_gdquot = gdq; - } - if (! locked) { - xfs_dqunlock(gdq); - xfs_dqunlock(udq); - } -} - - -/* - * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON - * into account. - * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. - * If XFS_QMOPT_DQLOCK, the dquot(s) will be returned locked. This option pretty - * much made this code a complete mess, but it has been pretty useful. - * If XFS_QMOPT_ILOCKED, then inode sent is already locked EXCL. - * Inode may get unlocked and relocked in here, and the caller must deal with - * the consequences. - */ -int -xfs_qm_dqattach( - xfs_inode_t *ip, - uint flags) -{ - xfs_mount_t *mp = ip->i_mount; - uint nquotas = 0; - int error = 0; - - if ((! XFS_IS_QUOTA_ON(mp)) || - (! XFS_NOT_DQATTACHED(mp, ip)) || - (ip->i_ino == mp->m_sb.sb_uquotino) || - (ip->i_ino == mp->m_sb.sb_gquotino)) - return 0; - - ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || - XFS_ISLOCKED_INODE_EXCL(ip)); - - if (! (flags & XFS_QMOPT_ILOCKED)) - xfs_ilock(ip, XFS_ILOCK_EXCL); - - if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, - NULL, &ip->i_udquot); - if (error) - goto done; - nquotas++; - } - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - if (XFS_IS_OQUOTA_ON(mp)) { - error = XFS_IS_GQUOTA_ON(mp) ? - xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, - ip->i_udquot, &ip->i_gdquot) : - xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ, - flags & XFS_QMOPT_DQALLOC, - flags & XFS_QMOPT_DQLOCK, - ip->i_udquot, &ip->i_gdquot); - /* - * Don't worry about the udquot that we may have - * attached above. It'll get detached, if not already. - */ - if (error) - goto done; - nquotas++; - } - - /* - * Attach this group quota to the user quota as a hint. - * This WON'T, in general, result in a thrash. - */ - if (nquotas == 2) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - ASSERT(ip->i_udquot); - ASSERT(ip->i_gdquot); - - /* - * We may or may not have the i_udquot locked at this point, - * but this check is OK since we don't depend on the i_gdquot to - * be accurate 100% all the time. It is just a hint, and this - * will succeed in general. - */ - if (ip->i_udquot->q_gdquot == ip->i_gdquot) - goto done; - /* - * Attach i_gdquot to the gdquot hint inside the i_udquot. - */ - xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot, - flags & XFS_QMOPT_DQLOCK); - } - - done: - -#ifdef QUOTADEBUG - if (! error) { - if (ip->i_udquot) { - if (flags & XFS_QMOPT_DQLOCK) - ASSERT(XFS_DQ_IS_LOCKED(ip->i_udquot)); - } - if (ip->i_gdquot) { - if (flags & XFS_QMOPT_DQLOCK) - ASSERT(XFS_DQ_IS_LOCKED(ip->i_gdquot)); - } - if (XFS_IS_UQUOTA_ON(mp)) - ASSERT(ip->i_udquot); - if (XFS_IS_OQUOTA_ON(mp)) - ASSERT(ip->i_gdquot); - } -#endif - - if (! (flags & XFS_QMOPT_ILOCKED)) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - -#ifdef QUOTADEBUG - else - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); -#endif - return error; -} - -/* - * Release dquots (and their references) if any. - * The inode should be locked EXCL except when this's called by - * xfs_ireclaim. - */ -void -xfs_qm_dqdetach( - xfs_inode_t *ip) -{ - if (!(ip->i_udquot || ip->i_gdquot)) - return; - - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino); - ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino); - if (ip->i_udquot) { - xfs_dqtrace_entry_ino(ip->i_udquot, "DQDETTACH", ip); - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if (ip->i_gdquot) { - xfs_dqtrace_entry_ino(ip->i_gdquot, "DQDETTACH", ip); - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } -} - -/* - * This is called by VFS_SYNC and flags arg determines the caller, - * and its motives, as done in xfs_sync. - * - * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31 - * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25 - * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA - */ - -int -xfs_qm_sync( - xfs_mount_t *mp, - short flags) -{ - int recl, restarts; - xfs_dquot_t *dqp; - uint flush_flags; - boolean_t nowait; - int error; - - restarts = 0; - /* - * We won't block unless we are asked to. - */ - nowait = (boolean_t)(flags & SYNC_BDFLUSH || (flags & SYNC_WAIT) == 0); - - again: - xfs_qm_mplist_lock(mp); - /* - * dqpurge_all() also takes the mplist lock and iterate thru all dquots - * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared - * when we have the mplist lock, we know that dquots will be consistent - * as long as we have it locked. - */ - if (! XFS_IS_QUOTA_ON(mp)) { - xfs_qm_mplist_unlock(mp); - return 0; - } - FOREACH_DQUOT_IN_MP(dqp, mp) { - /* - * If this is vfs_sync calling, then skip the dquots that - * don't 'seem' to be dirty. ie. don't acquire dqlock. - * This is very similar to what xfs_sync does with inodes. - */ - if (flags & SYNC_BDFLUSH) { - if (! XFS_DQ_IS_DIRTY(dqp)) - continue; - } - - if (nowait) { - /* - * Try to acquire the dquot lock. We are NOT out of - * lock order, but we just don't want to wait for this - * lock, unless somebody wanted us to. - */ - if (! xfs_qm_dqlock_nowait(dqp)) - continue; - } else { - xfs_dqlock(dqp); - } - - /* - * Now, find out for sure if this dquot is dirty or not. - */ - if (! XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* XXX a sentinel would be better */ - recl = XFS_QI_MPLRECLAIMS(mp); - if (! xfs_qm_dqflock_nowait(dqp)) { - if (nowait) { - xfs_dqunlock(dqp); - continue; - } - /* - * If we can't grab the flush lock then if the caller - * really wanted us to give this our best shot, so - * see if we can give a push to the buffer before we wait - * on the flush lock. At this point, we know that - * even though the dquot is being flushed, - * it has (new) dirty data. - */ - xfs_qm_dqflock_pushbuf_wait(dqp); - } - /* - * Let go of the mplist lock. We don't want to hold it - * across a disk write - */ - flush_flags = (nowait) ? XFS_QMOPT_DELWRI : XFS_QMOPT_SYNC; - xfs_qm_mplist_unlock(mp); - xfs_dqtrace_entry(dqp, "XQM_SYNC: DQFLUSH"); - error = xfs_qm_dqflush(dqp, flush_flags); - xfs_dqunlock(dqp); - if (error && XFS_FORCED_SHUTDOWN(mp)) - return 0; /* Need to prevent umount failure */ - else if (error) - return error; - - xfs_qm_mplist_lock(mp); - if (recl != XFS_QI_MPLRECLAIMS(mp)) { - if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS) - break; - - xfs_qm_mplist_unlock(mp); - goto again; - } - } - - xfs_qm_mplist_unlock(mp); - return 0; -} - - -/* - * This initializes all the quota information that's kept in the - * mount structure - */ -STATIC int -xfs_qm_init_quotainfo( - xfs_mount_t *mp) -{ - xfs_quotainfo_t *qinf; - int error; - xfs_dquot_t *dqp; - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - /* - * Tell XQM that we exist as soon as possible. - */ - if ((error = xfs_qm_hold_quotafs_ref(mp))) { - return error; - } - - qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - - /* - * See if quotainodes are setup, and if not, allocate them, - * and change the superblock accordingly. - */ - if ((error = xfs_qm_init_quotainos(mp))) { - kmem_free(qinf, sizeof(xfs_quotainfo_t)); - mp->m_quotainfo = NULL; - return error; - } - - spinlock_init(&qinf->qi_pinlock, "xfs_qinf_pin"); - xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0); - qinf->qi_dqreclaims = 0; - - /* mutex used to serialize quotaoffs */ - mutex_init(&qinf->qi_quotaofflock); - - /* Precalc some constants */ - qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - ASSERT(qinf->qi_dqchunklen); - qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen); - do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t)); - - mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - - /* - * We try to get the limits from the superuser's limits fields. - * This is quite hacky, but it is standard quota practice. - * We look at the USR dquot with id == 0 first, but if user quotas - * are not enabled we goto the GRP dquot with id == 0. - * We don't really care to keep separate default limits for user - * and group quotas, at least not at this point. - */ - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN, - &dqp); - if (! error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; - - /* - * The warnings and timers set the grace period given to - * a user or group before he or she can not perform any - * more writing. If it is zero, a default is used. - */ - qinf->qi_btimelimit = ddqp->d_btimer ? - be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = ddqp->d_itimer ? - be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? - be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = ddqp->d_bwarns ? - be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = ddqp->d_iwarns ? - be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? - be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; - qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - - /* - * We sent the XFS_QMOPT_DQSUSER flag to dqget because - * we don't want this dquot cached. We haven't done a - * quotacheck yet, and quotacheck doesn't like incore dquots. - */ - xfs_qm_dqdestroy(dqp); - } else { - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; - } - - return 0; -} - - -/* - * Gets called when unmounting a filesystem or when all quotas get - * turned off. - * This purges the quota inodes, destroys locks and frees itself. - */ -void -xfs_qm_destroy_quotainfo( - xfs_mount_t *mp) -{ - xfs_quotainfo_t *qi; - - qi = mp->m_quotainfo; - ASSERT(qi != NULL); - ASSERT(xfs_Gqm != NULL); - - /* - * Release the reference that XQM kept, so that we know - * when the XQM structure should be freed. We cannot assume - * that xfs_Gqm is non-null after this point. - */ - xfs_qm_rele_quotafs_ref(mp); - - spinlock_destroy(&qi->qi_pinlock); - xfs_qm_list_destroy(&qi->qi_dqlist); - - if (qi->qi_uquotaip) { - XFS_PURGE_INODE(qi->qi_uquotaip); - qi->qi_uquotaip = NULL; /* paranoia */ - } - if (qi->qi_gquotaip) { - XFS_PURGE_INODE(qi->qi_gquotaip); - qi->qi_gquotaip = NULL; - } - mutex_destroy(&qi->qi_quotaofflock); - kmem_free(qi, sizeof(xfs_quotainfo_t)); - mp->m_quotainfo = NULL; -} - - - -/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */ - -/* ARGSUSED */ -STATIC void -xfs_qm_list_init( - xfs_dqlist_t *list, - char *str, - int n) -{ - mutex_init(&list->qh_lock); - list->qh_next = NULL; - list->qh_version = 0; - list->qh_nelems = 0; -} - -STATIC void -xfs_qm_list_destroy( - xfs_dqlist_t *list) -{ - mutex_destroy(&(list->qh_lock)); -} - - -/* - * Stripped down version of dqattach. This doesn't attach, or even look at the - * dquots attached to the inode. The rationale is that there won't be any - * attached at the time this is called from quotacheck. - */ -STATIC int -xfs_qm_dqget_noattach( - xfs_inode_t *ip, - xfs_dquot_t **O_udqpp, - xfs_dquot_t **O_gdqpp) -{ - int error; - xfs_mount_t *mp; - xfs_dquot_t *udqp, *gdqp; - - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - mp = ip->i_mount; - udqp = NULL; - gdqp = NULL; - - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(ip->i_udquot == NULL); - /* - * We want the dquot allocated if it doesn't exist. - */ - if ((error = xfs_qm_dqget(mp, ip, ip->i_d.di_uid, XFS_DQ_USER, - XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, - &udqp))) { - /* - * Shouldn't be able to turn off quotas here. - */ - ASSERT(error != ESRCH); - ASSERT(error != ENOENT); - return error; - } - ASSERT(udqp); - } - - if (XFS_IS_OQUOTA_ON(mp)) { - ASSERT(ip->i_gdquot == NULL); - if (udqp) - xfs_dqunlock(udqp); - error = XFS_IS_GQUOTA_ON(mp) ? - xfs_qm_dqget(mp, ip, - ip->i_d.di_gid, XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, - &gdqp) : - xfs_qm_dqget(mp, ip, - ip->i_d.di_projid, XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC|XFS_QMOPT_DOWARN, - &gdqp); - if (error) { - if (udqp) - xfs_qm_dqrele(udqp); - ASSERT(error != ESRCH); - ASSERT(error != ENOENT); - return error; - } - ASSERT(gdqp); - - /* Reacquire the locks in the right order */ - if (udqp) { - if (! xfs_qm_dqlock_nowait(udqp)) { - xfs_dqunlock(gdqp); - xfs_dqlock(udqp); - xfs_dqlock(gdqp); - } - } - } - - *O_udqpp = udqp; - *O_gdqpp = gdqp; - -#ifdef QUOTADEBUG - if (udqp) ASSERT(XFS_DQ_IS_LOCKED(udqp)); - if (gdqp) ASSERT(XFS_DQ_IS_LOCKED(gdqp)); -#endif - return 0; -} - -/* - * Create an inode and return with a reference already taken, but unlocked - * This is how we create quota inodes - */ -STATIC int -xfs_qm_qino_alloc( - xfs_mount_t *mp, - xfs_inode_t **ip, - __int64_t sbfields, - uint flags) -{ - xfs_trans_t *tp; - int error; - unsigned long s; - int committed; - - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); - if ((error = xfs_trans_reserve(tp, - XFS_QM_QINOCREATE_SPACE_RES(mp), - XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_CREATE_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - return error; - } - - if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0, - &xfs_zerocr, 0, 1, ip, &committed))) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); - return error; - } - - /* - * Keep an extra reference to this quota inode. This inode is - * locked exclusively and joined to the transaction already. - */ - ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip)); - VN_HOLD(XFS_ITOV((*ip))); - - /* - * Make the changes in the superblock, and log those too. - * sbfields arg may contain fields other than *QUOTINO; - * VERSIONNUM for example. - */ - s = XFS_SB_LOCK(mp); - if (flags & XFS_QMOPT_SBVERSION) { -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - unsigned oldv = mp->m_sb.sb_versionnum; -#endif - ASSERT(!XFS_SB_VERSION_HASQUOTA(&mp->m_sb)); - ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) == - (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS)); - - XFS_SB_VERSION_ADDQUOTA(&mp->m_sb); - mp->m_sb.sb_uquotino = NULLFSINO; - mp->m_sb.sb_gquotino = NULLFSINO; - - /* qflags will get updated _after_ quotacheck */ - mp->m_sb.sb_qflags = 0; -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - cmn_err(CE_NOTE, - "Old superblock version %x, converting to %x.", - oldv, mp->m_sb.sb_versionnum); -#endif - } - if (flags & XFS_QMOPT_UQUOTA) - mp->m_sb.sb_uquotino = (*ip)->i_ino; - else - mp->m_sb.sb_gquotino = (*ip)->i_ino; - XFS_SB_UNLOCK(mp, s); - xfs_mod_sb(tp, sbfields); - - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, - NULL))) { - xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); - return error; - } - return 0; -} - - -STATIC int -xfs_qm_reset_dqcounts( - xfs_mount_t *mp, - xfs_buf_t *bp, - xfs_dqid_t id, - uint type) -{ - xfs_disk_dquot_t *ddq; - int j; - - xfs_buftrace("RESET DQUOTS", bp); - /* - * Reset all counters and timers. They'll be - * started afresh by xfs_qm_quotacheck. - */ -#ifdef DEBUG - j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - do_div(j, sizeof(xfs_dqblk_t)); - ASSERT(XFS_QM_DQPERBLK(mp) == j); -#endif - ddq = (xfs_disk_dquot_t *)XFS_BUF_PTR(bp); - for (j = 0; j < XFS_QM_DQPERBLK(mp); j++) { - /* - * Do a sanity check, and if needed, repair the dqblk. Don't - * output any warnings because it's perfectly possible to - * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. - */ - (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, - "xfs_quotacheck"); - ddq->d_bcount = 0; - ddq->d_icount = 0; - ddq->d_rtbcount = 0; - ddq->d_btimer = 0; - ddq->d_itimer = 0; - ddq->d_rtbtimer = 0; - ddq->d_bwarns = 0; - ddq->d_iwarns = 0; - ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); - } - - return 0; -} - -STATIC int -xfs_qm_dqiter_bufs( - xfs_mount_t *mp, - xfs_dqid_t firstid, - xfs_fsblock_t bno, - xfs_filblks_t blkcnt, - uint flags) -{ - xfs_buf_t *bp; - int error; - int notcommitted; - int incr; - int type; - - ASSERT(blkcnt > 0); - notcommitted = 0; - incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ? - XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt; - type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : - (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); - error = 0; - - /* - * Blkcnt arg can be a very big number, and might even be - * larger than the log itself. So, we have to break it up into - * manageable-sized transactions. - * Note that we don't start a permanent transaction here; we might - * not be able to get a log reservation for the whole thing up front, - * and we don't really care to either, because we just discard - * everything if we were to crash in the middle of this loop. - */ - while (blkcnt--) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, bno), - (int)XFS_QI_DQCHUNKLEN(mp), 0, &bp); - if (error) - break; - - (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type); - xfs_bdwrite(mp, bp); - /* - * goto the next block. - */ - bno++; - firstid += XFS_QM_DQPERBLK(mp); - } - return error; -} - -/* - * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a - * caller supplied function for every chunk of dquots that we find. - */ -STATIC int -xfs_qm_dqiterate( - xfs_mount_t *mp, - xfs_inode_t *qip, - uint flags) -{ - xfs_bmbt_irec_t *map; - int i, nmaps; /* number of map entries */ - int error; /* return value */ - xfs_fileoff_t lblkno; - xfs_filblks_t maxlblkcnt; - xfs_dqid_t firstid; - xfs_fsblock_t rablkno; - xfs_filblks_t rablkcnt; - - error = 0; - /* - * This looks racy, but we can't keep an inode lock across a - * trans_reserve. But, this gets called during quotacheck, and that - * happens only at mount time which is single threaded. - */ - if (qip->i_d.di_nblocks == 0) - return 0; - - map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); - - lblkno = 0; - maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - do { - nmaps = XFS_DQITER_MAP_SIZE; - /* - * We aren't changing the inode itself. Just changing - * some of its data. No new blocks are added here, and - * the inode is never added to the transaction. - */ - xfs_ilock(qip, XFS_ILOCK_SHARED); - error = xfs_bmapi(NULL, qip, lblkno, - maxlblkcnt - lblkno, - XFS_BMAPI_METADATA, - NULL, - 0, map, &nmaps, NULL, NULL); - xfs_iunlock(qip, XFS_ILOCK_SHARED); - if (error) - break; - - ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); - for (i = 0; i < nmaps; i++) { - ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); - ASSERT(map[i].br_blockcount); - - - lblkno += map[i].br_blockcount; - - if (map[i].br_startblock == HOLESTARTBLOCK) - continue; - - firstid = (xfs_dqid_t) map[i].br_startoff * - XFS_QM_DQPERBLK(mp); - /* - * Do a read-ahead on the next extent. - */ - if ((i+1 < nmaps) && - (map[i+1].br_startblock != HOLESTARTBLOCK)) { - rablkcnt = map[i+1].br_blockcount; - rablkno = map[i+1].br_startblock; - while (rablkcnt--) { - xfs_baread(mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, rablkno), - (int)XFS_QI_DQCHUNKLEN(mp)); - rablkno++; - } - } - /* - * Iterate thru all the blks in the extent and - * reset the counters of all the dquots inside them. - */ - if ((error = xfs_qm_dqiter_bufs(mp, - firstid, - map[i].br_startblock, - map[i].br_blockcount, - flags))) { - break; - } - } - - if (error) - break; - } while (nmaps > 0); - - kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map)); - - return error; -} - -/* - * Called by dqusage_adjust in doing a quotacheck. - * Given the inode, and a dquot (either USR or GRP, doesn't matter), - * this updates its incore copy as well as the buffer copy. This is - * so that once the quotacheck is done, we can just log all the buffers, - * as opposed to logging numerous updates to individual dquots. - */ -STATIC void -xfs_qm_quotacheck_dqadjust( - xfs_dquot_t *dqp, - xfs_qcnt_t nblks, - xfs_qcnt_t rtblks) -{ - ASSERT(XFS_DQ_IS_LOCKED(dqp)); - xfs_dqtrace_entry(dqp, "QCHECK DQADJUST"); - /* - * Adjust the inode count and the block count to reflect this inode's - * resource usage. - */ - be64_add(&dqp->q_core.d_icount, 1); - dqp->q_res_icount++; - if (nblks) { - be64_add(&dqp->q_core.d_bcount, nblks); - dqp->q_res_bcount += nblks; - } - if (rtblks) { - be64_add(&dqp->q_core.d_rtbcount, rtblks); - dqp->q_res_rtbcount += rtblks; - } - - /* - * Set default limits, adjust timers (since we changed usages) - */ - if (! XFS_IS_SUSER_DQUOT(dqp)) { - xfs_qm_adjust_dqlimits(dqp->q_mount, &dqp->q_core); - xfs_qm_adjust_dqtimers(dqp->q_mount, &dqp->q_core); - } - - dqp->dq_flags |= XFS_DQ_DIRTY; -} - -STATIC int -xfs_qm_get_rtblks( - xfs_inode_t *ip, - xfs_qcnt_t *O_rtblks) -{ - xfs_filblks_t rtblks; /* total rt blks */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ - xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ - int error; - - ASSERT(XFS_IS_REALTIME_INODE(ip)); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) - return error; - } - rtblks = 0; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0; idx < nextents; idx++) { - ep = xfs_iext_get_ext(ifp, idx); - rtblks += xfs_bmbt_get_blockcount(ep); - } - *O_rtblks = (xfs_qcnt_t)rtblks; - return 0; -} - -/* - * callback routine supplied to bulkstat(). Given an inumber, find its - * dquots and update them to account for resources taken by that inode. - */ -/* ARGSUSED */ -STATIC int -xfs_qm_dqusage_adjust( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* not used */ - int ubsize, /* not used */ - void *private_data, /* not used */ - xfs_daddr_t bno, /* starting block of inode cluster */ - int *ubused, /* not used */ - void *dip, /* on-disk inode pointer (not used) */ - int *res) /* result code value */ -{ - xfs_inode_t *ip; - xfs_dquot_t *udqp, *gdqp; - xfs_qcnt_t nblks, rtblks; - int error; - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - /* - * rootino must have its resources accounted for, not so with the quota - * inodes. - */ - if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(EINVAL); - } - - /* - * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget - * interface expects the inode to be exclusively locked because that's - * the case in all other instances. It's OK that we do this because - * quotacheck is done only at mount time. - */ - if ((error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip, bno))) { - *res = BULKSTAT_RV_NOTHING; - return error; - } - - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, XFS_ILOCK_EXCL); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - - /* - * Obtain the locked dquots. In case of an error (eg. allocation - * fails for ENOSPC), we return the negative of the error number - * to bulkstat, so that it can get propagated to quotacheck() and - * making us disable quotas for the file system. - */ - if ((error = xfs_qm_dqget_noattach(ip, &udqp, &gdqp))) { - xfs_iput(ip, XFS_ILOCK_EXCL); - *res = BULKSTAT_RV_GIVEUP; - return error; - } - - rtblks = 0; - if (! XFS_IS_REALTIME_INODE(ip)) { - nblks = (xfs_qcnt_t)ip->i_d.di_nblocks; - } else { - /* - * Walk thru the extent list and count the realtime blocks. - */ - if ((error = xfs_qm_get_rtblks(ip, &rtblks))) { - xfs_iput(ip, XFS_ILOCK_EXCL); - if (udqp) - xfs_qm_dqput(udqp); - if (gdqp) - xfs_qm_dqput(gdqp); - *res = BULKSTAT_RV_GIVEUP; - return error; - } - nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; - } - ASSERT(ip->i_delayed_blks == 0); - - /* - * We can't release the inode while holding its dquot locks. - * The inode can go into inactive and might try to acquire the dquotlocks. - * So, just unlock here and do a vn_rele at the end. - */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - /* - * Add the (disk blocks and inode) resources occupied by this - * inode to its dquots. We do this adjustment in the incore dquot, - * and also copy the changes to its buffer. - * We don't care about putting these changes in a transaction - * envelope because if we crash in the middle of a 'quotacheck' - * we have to start from the beginning anyway. - * Once we're done, we'll log all the dquot bufs. - * - * The *QUOTA_ON checks below may look pretty racy, but quotachecks - * and quotaoffs don't race. (Quotachecks happen at mount time only). - */ - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(udqp); - xfs_qm_quotacheck_dqadjust(udqp, nblks, rtblks); - xfs_qm_dqput(udqp); - } - if (XFS_IS_OQUOTA_ON(mp)) { - ASSERT(gdqp); - xfs_qm_quotacheck_dqadjust(gdqp, nblks, rtblks); - xfs_qm_dqput(gdqp); - } - /* - * Now release the inode. This will send it to 'inactive', and - * possibly even free blocks. - */ - VN_RELE(XFS_ITOV(ip)); - - /* - * Goto next inode. - */ - *res = BULKSTAT_RV_DIDONE; - return 0; -} - -/* - * Walk thru all the filesystem inodes and construct a consistent view - * of the disk quota world. If the quotacheck fails, disable quotas. - */ -int -xfs_qm_quotacheck( - xfs_mount_t *mp) -{ - int done, count, error; - xfs_ino_t lastino; - size_t structsz; - xfs_inode_t *uip, *gip; - uint flags; - - count = INT_MAX; - structsz = 1; - lastino = 0; - flags = 0; - - ASSERT(XFS_QI_UQIP(mp) || XFS_QI_GQIP(mp)); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - /* - * There should be no cached dquots. The (simplistic) quotacheck - * algorithm doesn't like that. - */ - ASSERT(XFS_QI_MPLNDQUOTS(mp) == 0); - - cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); - - /* - * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset - * their counters to zero. We need a clean slate. - * We don't log our changes till later. - */ - if ((uip = XFS_QI_UQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA))) - goto error_return; - flags |= XFS_UQUOTA_CHKD; - } - - if ((gip = XFS_QI_GQIP(mp))) { - if ((error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA))) - goto error_return; - flags |= XFS_OQUOTA_CHKD; - } - - do { - /* - * Iterate thru all the inodes in the file system, - * adjusting the corresponding dquot counters in core. - */ - if ((error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_dqusage_adjust, NULL, - structsz, NULL, BULKSTAT_FG_IGET, &done))) - break; - - } while (! done); - - /* - * We can get this error if we couldn't do a dquot allocation inside - * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the - * dirty dquots that might be cached, we just want to get rid of them - * and turn quotaoff. The dquots won't be attached to any of the inodes - * at this point (because we intentionally didn't in dqget_noattach). - */ - if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); - goto error_return; - } - /* - * We've made all the changes that we need to make incore. - * Now flush_them down to disk buffers. - */ - xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI); - - /* - * We didn't log anything, because if we crashed, we'll have to - * start the quotacheck from scratch anyway. However, we must make - * sure that our dquot changes are secure before we put the - * quotacheck'd stamp on the superblock. So, here we do a synchronous - * flush. - */ - XFS_bflush(mp->m_ddev_targp); - - /* - * If one type of quotas is off, then it will lose its - * quotachecked status, since we won't be doing accounting for - * that type anymore. - */ - mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD); - mp->m_qflags |= flags; - - XQM_LIST_PRINT(&(XFS_QI_MPL_LIST(mp)), MPL_NEXT, "++++ Mp list +++"); - - error_return: - if (error) { - cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " - "Disabling quotas.", - mp->m_fsname, error); - /* - * We must turn off quotas. - */ - ASSERT(mp->m_quotainfo != NULL); - ASSERT(xfs_Gqm != NULL); - xfs_qm_destroy_quotainfo(mp); - (void)xfs_mount_reset_sbqflags(mp); - } else { - cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); - } - return (error); -} - -/* - * This is called after the superblock has been read in and we're ready to - * iget the quota inodes. - */ -STATIC int -xfs_qm_init_quotainos( - xfs_mount_t *mp) -{ - xfs_inode_t *uip, *gip; - int error; - __int64_t sbflags; - uint flags; - - ASSERT(mp->m_quotainfo); - uip = gip = NULL; - sbflags = 0; - flags = 0; - - /* - * Get the uquota and gquota inodes - */ - if (XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) { - if (XFS_IS_UQUOTA_ON(mp) && - mp->m_sb.sb_uquotino != NULLFSINO) { - ASSERT(mp->m_sb.sb_uquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip, 0))) - return XFS_ERROR(error); - } - if (XFS_IS_OQUOTA_ON(mp) && - mp->m_sb.sb_gquotino != NULLFSINO) { - ASSERT(mp->m_sb.sb_gquotino > 0); - if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip, 0))) { - if (uip) - VN_RELE(XFS_ITOV(uip)); - return XFS_ERROR(error); - } - } - } else { - flags |= XFS_QMOPT_SBVERSION; - sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | - XFS_SB_GQUOTINO | XFS_SB_QFLAGS); - } - - /* - * Create the two inodes, if they don't exist already. The changes - * made above will get added to a transaction and logged in one of - * the qino_alloc calls below. If the device is readonly, - * temporarily switch to read-write to do this. - */ - if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { - if ((error = xfs_qm_qino_alloc(mp, &uip, - sbflags | XFS_SB_UQUOTINO, - flags | XFS_QMOPT_UQUOTA))) - return XFS_ERROR(error); - - flags &= ~XFS_QMOPT_SBVERSION; - } - if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) { - flags |= (XFS_IS_GQUOTA_ON(mp) ? - XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); - error = xfs_qm_qino_alloc(mp, &gip, - sbflags | XFS_SB_GQUOTINO, flags); - if (error) { - if (uip) - VN_RELE(XFS_ITOV(uip)); - - return XFS_ERROR(error); - } - } - - XFS_QI_UQIP(mp) = uip; - XFS_QI_GQIP(mp) = gip; - - return 0; -} - - -/* - * Traverse the freelist of dquots and attempt to reclaim a maximum of - * 'howmany' dquots. This operation races with dqlookup(), and attempts to - * favor the lookup function ... - * XXXsup merge this with qm_reclaim_one(). - */ -STATIC int -xfs_qm_shake_freelist( - int howmany) -{ - int nreclaimed; - xfs_dqhash_t *hash; - xfs_dquot_t *dqp, *nextdqp; - int restarts; - int nflushes; - - if (howmany <= 0) - return 0; - - nreclaimed = 0; - restarts = 0; - nflushes = 0; - -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake free 0x%x", howmany); -#endif - /* lock order is : hashchainlock, freelistlock, mplistlock */ - tryagain: - xfs_qm_freelist_lock(xfs_Gqm); - - for (dqp = xfs_Gqm->qm_dqfreelist.qh_next; - ((dqp != (xfs_dquot_t *) &xfs_Gqm->qm_dqfreelist) && - nreclaimed < howmany); ) { - xfs_dqlock(dqp); - - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto tryagain; - } - - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - nextdqp = dqp->dq_flnext; - goto off_freelist; - } - - ASSERT(dqp->MPL_PREVP); - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (! xfs_qm_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; - continue; - } - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); - /* - * We flush it delayed write, so don't bother - * releasing the mplock. - */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - dqp = dqp->dq_flnext; - continue; - } - /* - * We're trying to get the hashlock out of order. This races - * with dqlookup; so, we giveup and goto the next dquot if - * we couldn't get the hashlock. This way, we won't starve - * a dqlookup process that holds the hashlock that is - * waiting for the freelist lock. - */ - if (! xfs_qm_dqhashlock_nowait(dqp)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - dqp = dqp->dq_flnext; - continue; - } - /* - * This races with dquot allocation code as well as dqflush_all - * and reclaim code. So, if we failed to grab the mplist lock, - * giveup everything and start over. - */ - hash = dqp->q_hash; - ASSERT(hash); - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - /* XXX put a sentinel so that we can come back here */ - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - XFS_DQ_HASH_UNLOCK(hash); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return nreclaimed; - goto tryagain; - } - xfs_dqtrace_entry(dqp, "DQSHAKE: UNLINKING"); -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "Shake 0x%p, ID 0x%x\n", - dqp, be32_to_cpu(dqp->q_core.d_id)); -#endif - ASSERT(dqp->q_nrefs == 0); - nextdqp = dqp->dq_flnext; - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(hash, dqp); - xfs_dqfunlock(dqp); - xfs_qm_mplist_unlock(dqp->q_mount); - XFS_DQ_HASH_UNLOCK(hash); - - off_freelist: - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - nreclaimed++; - XQM_STATS_INC(xqmstats.xs_qm_dqshake_reclaims); - xfs_qm_dqdestroy(dqp); - dqp = nextdqp; - } - xfs_qm_freelist_unlock(xfs_Gqm); - return nreclaimed; -} - - -/* - * The kmem_shake interface is invoked when memory is running low. - */ -/* ARGSUSED */ -STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) -{ - int ndqused, nfree, n; - - if (!kmem_shake_allow(gfp_mask)) - return 0; - if (!xfs_Gqm) - return 0; - - nfree = xfs_Gqm->qm_dqfreelist.qh_nelems; /* free dquots */ - /* incore dquots in all f/s's */ - ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree; - - ASSERT(ndqused >= 0); - - if (nfree <= ndqused && nfree < ndquot) - return 0; - - ndqused *= xfs_Gqm->qm_dqfree_ratio; /* target # of free dquots */ - n = nfree - ndqused - ndquot; /* # over target */ - - return xfs_qm_shake_freelist(MAX(nfree, n)); -} - - -/* - * Just pop the least recently used dquot off the freelist and - * recycle it. The returned dquot is locked. - */ -STATIC xfs_dquot_t * -xfs_qm_dqreclaim_one(void) -{ - xfs_dquot_t *dqpout; - xfs_dquot_t *dqp; - int restarts; - int nflushes; - - restarts = 0; - dqpout = NULL; - nflushes = 0; - - /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */ - startagain: - xfs_qm_freelist_lock(xfs_Gqm); - - FOREACH_DQUOT_IN_FREELIST(dqp, &(xfs_Gqm->qm_dqfreelist)) { - xfs_dqlock(dqp); - - /* - * We are racing with dqlookup here. Naturally we don't - * want to reclaim a dquot that lookup wants. We release the - * freelist lock and start over, so that lookup will grab - * both the dquot and the freelistlock. - */ - if (dqp->dq_flags & XFS_DQ_WANT) { - ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE)); - xfs_dqtrace_entry(dqp, "DQRECLAIM: DQWANT"); - xfs_dqunlock(dqp); - xfs_qm_freelist_unlock(xfs_Gqm); - if (++restarts >= XFS_QM_RECLAIM_MAX_RESTARTS) - return NULL; - XQM_STATS_INC(xqmstats.xs_qm_dqwants); - goto startagain; - } - - /* - * If the dquot is inactive, we are assured that it is - * not on the mplist or the hashlist, and that makes our - * life easier. - */ - if (dqp->dq_flags & XFS_DQ_INACTIVE) { - ASSERT(dqp->q_mount == NULL); - ASSERT(! XFS_DQ_IS_DIRTY(dqp)); - ASSERT(dqp->HL_PREVP == NULL); - ASSERT(dqp->MPL_PREVP == NULL); - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - dqpout = dqp; - XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims); - break; - } - - ASSERT(dqp->q_hash); - ASSERT(dqp->MPL_PREVP); - - /* - * Try to grab the flush lock. If this dquot is in the process of - * getting flushed to disk, we don't want to reclaim it. - */ - if (! xfs_qm_dqflock_nowait(dqp)) { - xfs_dqunlock(dqp); - continue; - } - - /* - * We have the flush lock so we know that this is not in the - * process of being flushed. So, if this is dirty, flush it - * DELWRI so that we don't get a freelist infested with - * dirty dquots. - */ - if (XFS_DQ_IS_DIRTY(dqp)) { - xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); - /* - * We flush it delayed write, so don't bother - * releasing the freelist lock. - */ - (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); - xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ - continue; - } - - if (! xfs_qm_mplist_nowait(dqp->q_mount)) { - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - continue; - } - - if (! xfs_qm_dqhashlock_nowait(dqp)) - goto mplistunlock; - - ASSERT(dqp->q_nrefs == 0); - xfs_dqtrace_entry(dqp, "DQRECLAIM: UNLINKING"); - XQM_MPLIST_REMOVE(&(XFS_QI_MPL_LIST(dqp->q_mount)), dqp); - XQM_HASHLIST_REMOVE(dqp->q_hash, dqp); - XQM_FREELIST_REMOVE(dqp); - dqpout = dqp; - XFS_DQ_HASH_UNLOCK(dqp->q_hash); - mplistunlock: - xfs_qm_mplist_unlock(dqp->q_mount); - xfs_dqfunlock(dqp); - xfs_dqunlock(dqp); - if (dqpout) - break; - } - - xfs_qm_freelist_unlock(xfs_Gqm); - return dqpout; -} - - -/*------------------------------------------------------------------*/ - -/* - * Return a new incore dquot. Depending on the number of - * dquots in the system, we either allocate a new one on the kernel heap, - * or reclaim a free one. - * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed - * to reclaim an existing one from the freelist. - */ -boolean_t -xfs_qm_dqalloc_incore( - xfs_dquot_t **O_dqpp) -{ - xfs_dquot_t *dqp; - - /* - * Check against high water mark to see if we want to pop - * a nincompoop dquot off the freelist. - */ - if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) { - /* - * Try to recycle a dquot from the freelist. - */ - if ((dqp = xfs_qm_dqreclaim_one())) { - XQM_STATS_INC(xqmstats.xs_qm_dqreclaims); - /* - * Just zero the core here. The rest will get - * reinitialized by caller. XXX we shouldn't even - * do this zero ... - */ - memset(&dqp->q_core, 0, sizeof(dqp->q_core)); - *O_dqpp = dqp; - return B_FALSE; - } - XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses); - } - - /* - * Allocate a brand new dquot on the kernel heap and return it - * to the caller to initialize. - */ - ASSERT(xfs_Gqm->qm_dqzone != NULL); - *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP); - atomic_inc(&xfs_Gqm->qm_totaldquots); - - return B_TRUE; -} - - -/* - * Start a transaction and write the incore superblock changes to - * disk. flags parameter indicates which fields have changed. - */ -int -xfs_qm_write_sb_changes( - xfs_mount_t *mp, - __int64_t flags) -{ - xfs_trans_t *tp; - int error; - -#ifdef QUOTADEBUG - cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); -#endif - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - if ((error = xfs_trans_reserve(tp, 0, - mp->m_sb.sb_sectsize + 128, 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_mod_sb(tp, flags); - (void) xfs_trans_commit(tp, 0, NULL); - - return 0; -} - - -/* --------------- utility functions for vnodeops ---------------- */ - - -/* - * Given an inode, a uid and gid (from cred_t) make sure that we have - * allocated relevant dquot(s) on disk, and that we won't exceed inode - * quotas by creating this file. - * This also attaches dquot(s) to the given inode after locking it, - * and returns the dquots corresponding to the uid and/or gid. - * - * in : inode (unlocked) - * out : udquot, gdquot with references taken and unlocked - */ -int -xfs_qm_vop_dqalloc( - xfs_mount_t *mp, - xfs_inode_t *ip, - uid_t uid, - gid_t gid, - prid_t prid, - uint flags, - xfs_dquot_t **O_udqpp, - xfs_dquot_t **O_gdqpp) -{ - int error; - xfs_dquot_t *uq, *gq; - uint lockflags; - - if (!XFS_IS_QUOTA_ON(mp)) - return 0; - - lockflags = XFS_ILOCK_EXCL; - xfs_ilock(ip, lockflags); - - if ((flags & XFS_QMOPT_INHERIT) && - XFS_INHERIT_GID(ip, XFS_MTOVFS(mp))) - gid = ip->i_d.di_gid; - - /* - * Attach the dquot(s) to this inode, doing a dquot allocation - * if necessary. The dquot(s) will not be locked. - */ - if (XFS_NOT_DQATTACHED(mp, ip)) { - if ((error = xfs_qm_dqattach(ip, XFS_QMOPT_DQALLOC | - XFS_QMOPT_ILOCKED))) { - xfs_iunlock(ip, lockflags); - return error; - } - } - - uq = gq = NULL; - if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_d.di_uid != uid) { - /* - * What we need is the dquot that has this uid, and - * if we send the inode to dqget, the uid of the inode - * takes priority over what's sent in the uid argument. - * We must unlock inode here before calling dqget if - * we're not sending the inode, because otherwise - * we'll deadlock by doing trans_reserve while - * holding ilock. - */ - xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid, - XFS_DQ_USER, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, - &uq))) { - ASSERT(error != ENOENT); - return error; - } - /* - * Get the ilock in the right order. - */ - xfs_dqunlock(uq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); - } else { - /* - * Take an extra reference, because we'll return - * this to caller - */ - ASSERT(ip->i_udquot); - uq = ip->i_udquot; - xfs_dqlock(uq); - XFS_DQHOLD(uq); - xfs_dqunlock(uq); - } - } - if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { - if (ip->i_d.di_gid != gid) { - xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid, - XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); - ASSERT(error != ENOENT); - return error; - } - xfs_dqunlock(gq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); - } else { - ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); - } - } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { - if (ip->i_d.di_projid != prid) { - xfs_iunlock(ip, lockflags); - if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, - XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC | - XFS_QMOPT_DOWARN, - &gq))) { - if (uq) - xfs_qm_dqrele(uq); - ASSERT(error != ENOENT); - return (error); - } - xfs_dqunlock(gq); - lockflags = XFS_ILOCK_SHARED; - xfs_ilock(ip, lockflags); - } else { - ASSERT(ip->i_gdquot); - gq = ip->i_gdquot; - xfs_dqlock(gq); - XFS_DQHOLD(gq); - xfs_dqunlock(gq); - } - } - if (uq) - xfs_dqtrace_entry_ino(uq, "DQALLOC", ip); - - xfs_iunlock(ip, lockflags); - if (O_udqpp) - *O_udqpp = uq; - else if (uq) - xfs_qm_dqrele(uq); - if (O_gdqpp) - *O_gdqpp = gq; - else if (gq) - xfs_qm_dqrele(gq); - return 0; -} - -/* - * Actually transfer ownership, and do dquot modifications. - * These were already reserved. - */ -xfs_dquot_t * -xfs_qm_vop_chown( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t **IO_olddq, - xfs_dquot_t *newdq) -{ - xfs_dquot_t *prevdq; - uint bfield = XFS_IS_REALTIME_INODE(ip) ? - XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); - - /* old dquot */ - prevdq = *IO_olddq; - ASSERT(prevdq); - ASSERT(prevdq != newdq); - - xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); - xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); - - /* the sparkling new dquot */ - xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); - xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); - - /* - * Take an extra reference, because the inode - * is going to keep this dquot pointer even - * after the trans_commit. - */ - xfs_dqlock(newdq); - XFS_DQHOLD(newdq); - xfs_dqunlock(newdq); - *IO_olddq = newdq; - - return prevdq; -} - -/* - * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). - */ -int -xfs_qm_vop_chown_reserve( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - uint flags) -{ - int error; - xfs_mount_t *mp; - uint delblks, blkflags, prjflags = 0; - xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - - ASSERT(XFS_ISLOCKED_INODE(ip)); - mp = ip->i_mount; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - delblks = ip->i_delayed_blks; - delblksudq = delblksgdq = unresudq = unresgdq = NULL; - blkflags = XFS_IS_REALTIME_INODE(ip) ? - XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; - - if (XFS_IS_UQUOTA_ON(mp) && udqp && - ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) { - delblksudq = udqp; - /* - * If there are delayed allocation blocks, then we have to - * unreserve those from the old dquot, and add them to the - * new dquot. - */ - if (delblks) { - ASSERT(ip->i_udquot); - unresudq = ip->i_udquot; - } - } - if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { - if (XFS_IS_PQUOTA_ON(ip->i_mount) && - ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) - prjflags = XFS_QMOPT_ENOSPC; - - if (prjflags || - (XFS_IS_GQUOTA_ON(ip->i_mount) && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { - delblksgdq = gdqp; - if (delblks) { - ASSERT(ip->i_gdquot); - unresgdq = ip->i_gdquot; - } - } - } - - if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, - delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags | prjflags))) - return (error); - - /* - * Do the delayed blks reservations/unreservations now. Since, these - * are done without the help of a transaction, if a reservation fails - * its previous reservations won't be automatically undone by trans - * code. So, we have to do it manually here. - */ - if (delblks) { - /* - * Do the reservations first. Unreservation can't fail. - */ - ASSERT(delblksudq || delblksgdq); - ASSERT(unresudq || unresgdq); - if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags | prjflags))) - return (error); - xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, - unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, - blkflags); - } - - return (0); -} - -int -xfs_qm_vop_rename_dqattach( - xfs_inode_t **i_tab) -{ - xfs_inode_t *ip; - int i; - int error; - - ip = i_tab[0]; - - if (! XFS_IS_QUOTA_ON(ip->i_mount)) - return 0; - - if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - } - for (i = 1; (i < 4 && i_tab[i]); i++) { - /* - * Watch out for duplicate entries in the table. - */ - if ((ip = i_tab[i]) != i_tab[i-1]) { - if (XFS_NOT_DQATTACHED(ip->i_mount, ip)) { - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; - } - } - } - return 0; -} - -void -xfs_qm_vop_dqattach_and_dqmod_newinode( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp) -{ - if (!XFS_IS_QUOTA_ON(tp->t_mountp)) - return; - - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); - - if (udqp) { - xfs_dqlock(udqp); - XFS_DQHOLD(udqp); - xfs_dqunlock(udqp); - ASSERT(ip->i_udquot == NULL); - ip->i_udquot = udqp; - ASSERT(XFS_IS_UQUOTA_ON(tp->t_mountp)); - ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); - xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); - } - if (gdqp) { - xfs_dqlock(gdqp); - XFS_DQHOLD(gdqp); - xfs_dqunlock(gdqp); - ASSERT(ip->i_gdquot == NULL); - ip->i_gdquot = gdqp; - ASSERT(XFS_IS_OQUOTA_ON(tp->t_mountp)); - ASSERT((XFS_IS_GQUOTA_ON(tp->t_mountp) ? - ip->i_d.di_gid : ip->i_d.di_projid) == - be32_to_cpu(gdqp->q_core.d_id)); - xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); - } -} - -/* ------------- list stuff -----------------*/ -STATIC void -xfs_qm_freelist_init(xfs_frlist_t *ql) -{ - ql->qh_next = ql->qh_prev = (xfs_dquot_t *) ql; - mutex_init(&ql->qh_lock); - ql->qh_version = 0; - ql->qh_nelems = 0; -} - -STATIC void -xfs_qm_freelist_destroy(xfs_frlist_t *ql) -{ - xfs_dquot_t *dqp, *nextdqp; - - mutex_lock(&ql->qh_lock); - for (dqp = ql->qh_next; - dqp != (xfs_dquot_t *)ql; ) { - xfs_dqlock(dqp); - nextdqp = dqp->dq_flnext; -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); -#endif - XQM_FREELIST_REMOVE(dqp); - xfs_dqunlock(dqp); - xfs_qm_dqdestroy(dqp); - dqp = nextdqp; - } - mutex_unlock(&ql->qh_lock); - mutex_destroy(&ql->qh_lock); - - ASSERT(ql->qh_nelems == 0); -} - -STATIC void -xfs_qm_freelist_insert(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - dq->dq_flnext = ql->qh_next; - dq->dq_flprev = (xfs_dquot_t *)ql; - ql->qh_next = dq; - dq->dq_flnext->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems++; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_unlink(xfs_dquot_t *dq) -{ - xfs_dquot_t *next = dq->dq_flnext; - xfs_dquot_t *prev = dq->dq_flprev; - - next->dq_flprev = prev; - prev->dq_flnext = next; - dq->dq_flnext = dq->dq_flprev = dq; - xfs_Gqm->qm_dqfreelist.qh_nelems--; - xfs_Gqm->qm_dqfreelist.qh_version++; -} - -void -xfs_qm_freelist_append(xfs_frlist_t *ql, xfs_dquot_t *dq) -{ - xfs_qm_freelist_insert((xfs_frlist_t *)ql->qh_prev, dq); -} - -STATIC int -xfs_qm_dqhashlock_nowait( - xfs_dquot_t *dqp) -{ - int locked; - - locked = mutex_trylock(&((dqp)->q_hash->qh_lock)); - return locked; -} - -int -xfs_qm_freelist_lock_nowait( - xfs_qm_t *xqm) -{ - int locked; - - locked = mutex_trylock(&(xqm->qm_dqfreelist.qh_lock)); - return locked; -} - -STATIC int -xfs_qm_mplist_nowait( - xfs_mount_t *mp) -{ - int locked; - - ASSERT(mp->m_quotainfo); - locked = mutex_trylock(&(XFS_QI_MPLLOCK(mp))); - return locked; -} diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h deleted file mode 100644 index 4568deb6da8..00000000000 --- a/fs/xfs/quota/xfs_qm.h +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QM_H__ -#define __XFS_QM_H__ - -#include "xfs_dquot_item.h" -#include "xfs_dquot.h" -#include "xfs_quota_priv.h" -#include "xfs_qm_stats.h" - -struct xfs_qm; -struct xfs_inode; - -extern uint ndquot; -extern mutex_t xfs_Gqm_lock; -extern struct xfs_qm *xfs_Gqm; -extern kmem_zone_t *qm_dqzone; -extern kmem_zone_t *qm_dqtrxzone; - -/* - * Used in xfs_qm_sync called by xfs_sync to count the max times that it can - * iterate over the mountpt's dquot list in one call. - */ -#define XFS_QM_SYNC_MAX_RESTARTS 7 - -/* - * Ditto, for xfs_qm_dqreclaim_one. - */ -#define XFS_QM_RECLAIM_MAX_RESTARTS 4 - -/* - * Ideal ratio of free to in use dquots. Quota manager makes an attempt - * to keep this balance. - */ -#define XFS_QM_DQFREE_RATIO 2 - -/* - * Dquot hashtable constants/threshold values. - */ -#define XFS_QM_HASHSIZE_LOW (NBPP / sizeof(xfs_dqhash_t)) -#define XFS_QM_HASHSIZE_HIGH ((NBPP * 4) / sizeof(xfs_dqhash_t)) - -/* - * We output a cmn_err when quotachecking a quota file with more than - * this many fsbs. - */ -#define XFS_QM_BIG_QCHECK_NBLKS 500 - -/* - * This defines the unit of allocation of dquots. - * Currently, it is just one file system block, and a 4K blk contains 30 - * (136 * 30 = 4080) dquots. It's probably not worth trying to make - * this more dynamic. - * XXXsup However, if this number is changed, we have to make sure that we don't - * implicitly assume that we do allocations in chunks of a single filesystem - * block in the dquot/xqm code. - */ -#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 -/* - * When doing a quotacheck, we log dquot clusters of this many FSBs at most - * in a single transaction. We don't want to ask for too huge a log reservation. - */ -#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3 - -typedef xfs_dqhash_t xfs_dqlist_t; -/* - * The freelist head. The first two fields match the first two in the - * xfs_dquot_t structure (in xfs_dqmarker_t) - */ -typedef struct xfs_frlist { - struct xfs_dquot *qh_next; - struct xfs_dquot *qh_prev; - mutex_t qh_lock; - uint qh_version; - uint qh_nelems; -} xfs_frlist_t; - -/* - * Quota Manager (global) structure. Lives only in core. - */ -typedef struct xfs_qm { - xfs_dqlist_t *qm_usr_dqhtable;/* udquot hash table */ - xfs_dqlist_t *qm_grp_dqhtable;/* gdquot hash table */ - uint qm_dqhashmask; /* # buckets in dq hashtab - 1 */ - xfs_frlist_t qm_dqfreelist; /* freelist of dquots */ - atomic_t qm_totaldquots; /* total incore dquots */ - uint qm_nrefs; /* file systems with quota on */ - int qm_dqfree_ratio;/* ratio of free to inuse dquots */ - kmem_zone_t *qm_dqzone; /* dquot mem-alloc zone */ - kmem_zone_t *qm_dqtrxzone; /* t_dqinfo of transactions */ -} xfs_qm_t; - -/* - * Various quota information for individual filesystems. - * The mount structure keeps a pointer to this. - */ -typedef struct xfs_quotainfo { - xfs_inode_t *qi_uquotaip; /* user quota inode */ - xfs_inode_t *qi_gquotaip; /* group quota inode */ - lock_t qi_pinlock; /* dquot pinning mutex */ - xfs_dqlist_t qi_dqlist; /* all dquots in filesys */ - int qi_dqreclaims; /* a change here indicates - a removal in the dqlist */ - time_t qi_btimelimit; /* limit for blks timer */ - time_t qi_itimelimit; /* limit for inodes timer */ - time_t qi_rtbtimelimit;/* limit for rt blks timer */ - xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ - xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ - xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ - mutex_t qi_quotaofflock;/* to serialize quotaoff */ - xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ - uint qi_dqperchunk; /* # ondisk dqs in above chunk */ - xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ - xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ - xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ - xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ - xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ - xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ -} xfs_quotainfo_t; - - -extern xfs_dqtrxops_t xfs_trans_dquot_ops; - -extern void xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long); -extern int xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *, - xfs_dquot_t *, xfs_dquot_t *, long, long, uint); -extern void xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *); -extern void xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *); - -/* - * We keep the usr and grp dquots separately so that locking will be easier - * to do at commit time. All transactions that we know of at this point - * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. - */ -#define XFS_QM_TRANS_MAXDQS 2 -typedef struct xfs_dquot_acct { - xfs_dqtrx_t dqa_usrdquots[XFS_QM_TRANS_MAXDQS]; - xfs_dqtrx_t dqa_grpdquots[XFS_QM_TRANS_MAXDQS]; -} xfs_dquot_acct_t; - -/* - * Users are allowed to have a usage exceeding their softlimit for - * a period this long. - */ -#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ -#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ -#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ - -#define XFS_QM_BWARNLIMIT 5 -#define XFS_QM_IWARNLIMIT 5 -#define XFS_QM_RTBWARNLIMIT 5 - -#define XFS_QM_LOCK(xqm) (mutex_lock(&xqm##_lock)) -#define XFS_QM_UNLOCK(xqm) (mutex_unlock(&xqm##_lock)) -#define XFS_QM_HOLD(xqm) ((xqm)->qm_nrefs++) -#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) - -extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); -extern int xfs_qm_mount_quotas(xfs_mount_t *, int); -extern void xfs_qm_mount_quotainit(xfs_mount_t *, uint); -extern int xfs_qm_quotacheck(xfs_mount_t *); -extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); -extern int xfs_qm_unmount_quotas(xfs_mount_t *); -extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t); -extern int xfs_qm_sync(xfs_mount_t *, short); - -/* dquot stuff */ -extern boolean_t xfs_qm_dqalloc_incore(xfs_dquot_t **); -extern int xfs_qm_dqattach(xfs_inode_t *, uint); -extern void xfs_qm_dqdetach(xfs_inode_t *); -extern int xfs_qm_dqpurge_all(xfs_mount_t *, uint); -extern void xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint); - -/* vop stuff */ -extern int xfs_qm_vop_dqalloc(xfs_mount_t *, xfs_inode_t *, - uid_t, gid_t, prid_t, uint, - xfs_dquot_t **, xfs_dquot_t **); -extern void xfs_qm_vop_dqattach_and_dqmod_newinode( - xfs_trans_t *, xfs_inode_t *, - xfs_dquot_t *, xfs_dquot_t *); -extern int xfs_qm_vop_rename_dqattach(xfs_inode_t **); -extern xfs_dquot_t * xfs_qm_vop_chown(xfs_trans_t *, xfs_inode_t *, - xfs_dquot_t **, xfs_dquot_t *); -extern int xfs_qm_vop_chown_reserve(xfs_trans_t *, xfs_inode_t *, - xfs_dquot_t *, xfs_dquot_t *, uint); - -/* list stuff */ -extern void xfs_qm_freelist_append(xfs_frlist_t *, xfs_dquot_t *); -extern void xfs_qm_freelist_unlink(xfs_dquot_t *); -extern int xfs_qm_freelist_lock_nowait(xfs_qm_t *); - -/* system call interface */ -extern int xfs_qm_quotactl(bhv_desc_t *, int, int, xfs_caddr_t); - -#ifdef DEBUG -extern int xfs_qm_internalqcheck(xfs_mount_t *); -#else -#define xfs_qm_internalqcheck(mp) (0) -#endif - -#endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c deleted file mode 100644 index db8872be8c8..00000000000 --- a/fs/xfs/quota/xfs_qm_bhv.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_clnt.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_qm.h" - -#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ -#define MNTOPT_NOQUOTA "noquota" /* no quotas */ -#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ -#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ -#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ -#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ -#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ -#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ -#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ -#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ -#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ -#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ - -STATIC int -xfs_qm_parseargs( - struct bhv_desc *bhv, - char *options, - struct xfs_mount_args *args, - int update) -{ - size_t length; - char *local_options = options; - char *this_char; - int error; - int referenced = update; - - while ((this_char = strsep(&local_options, ",")) != NULL) { - length = strlen(this_char); - if (local_options) - length++; - - if (!strcmp(this_char, MNTOPT_NOQUOTA)) { - args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA); - args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA); - referenced = update; - } else if (!strcmp(this_char, MNTOPT_QUOTA) || - !strcmp(this_char, MNTOPT_UQUOTA) || - !strcmp(this_char, MNTOPT_USRQUOTA)) { - args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || - !strcmp(this_char, MNTOPT_UQUOTANOENF)) { - args->flags |= XFSMNT_UQUOTA; - args->flags &= ~XFSMNT_UQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_PQUOTA) || - !strcmp(this_char, MNTOPT_PRJQUOTA)) { - args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { - args->flags |= XFSMNT_PQUOTA; - args->flags &= ~XFSMNT_PQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_GQUOTA) || - !strcmp(this_char, MNTOPT_GRPQUOTA)) { - args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF; - referenced = 1; - } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { - args->flags |= XFSMNT_GQUOTA; - args->flags &= ~XFSMNT_GQUOTAENF; - referenced = 1; - } else { - if (local_options) - *(local_options-1) = ','; - continue; - } - - while (length--) - *this_char++ = ','; - } - - if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) { - cmn_err(CE_WARN, - "XFS: cannot mount with both project and group quota"); - return XFS_ERROR(EINVAL); - } - - error = bhv_next_vfs_parseargs(BHV_NEXT(bhv), options, args, update); - if (!error && !referenced) - bhv_remove_vfsops(bhvtovfs(bhv), VFS_POSITION_QM); - return error; -} - -STATIC int -xfs_qm_showargs( - struct bhv_desc *bhv, - struct seq_file *m) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhv); - struct xfs_mount *mp = XFS_VFSTOM(vfsp); - - if (mp->m_qflags & XFS_UQUOTA_ACCT) { - (mp->m_qflags & XFS_UQUOTA_ENFD) ? - seq_puts(m, "," MNTOPT_USRQUOTA) : - seq_puts(m, "," MNTOPT_UQUOTANOENF); - } - - if (mp->m_qflags & XFS_PQUOTA_ACCT) { - (mp->m_qflags & XFS_OQUOTA_ENFD) ? - seq_puts(m, "," MNTOPT_PRJQUOTA) : - seq_puts(m, "," MNTOPT_PQUOTANOENF); - } - - if (mp->m_qflags & XFS_GQUOTA_ACCT) { - (mp->m_qflags & XFS_OQUOTA_ENFD) ? - seq_puts(m, "," MNTOPT_GRPQUOTA) : - seq_puts(m, "," MNTOPT_GQUOTANOENF); - } - - if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) - seq_puts(m, "," MNTOPT_NOQUOTA); - - return bhv_next_vfs_showargs(BHV_NEXT(bhv), m); -} - -STATIC int -xfs_qm_mount( - struct bhv_desc *bhv, - struct xfs_mount_args *args, - struct cred *cr) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhv); - struct xfs_mount *mp = XFS_VFSTOM(vfsp); - - if (args->flags & (XFSMNT_UQUOTA | XFSMNT_GQUOTA | XFSMNT_PQUOTA)) - xfs_qm_mount_quotainit(mp, args->flags); - return bhv_next_vfs_mount(BHV_NEXT(bhv), args, cr); -} - -/* - * Directory tree accounting is implemented using project quotas, where - * the project identifier is inherited from parent directories. - * A statvfs (df, etc.) of a directory that is using project quota should - * return a statvfs of the project, not the entire filesystem. - * This makes such trees appear as if they are filesystems in themselves. - */ -STATIC int -xfs_qm_statvfs( - struct bhv_desc *bhv, - bhv_statvfs_t *statp, - struct bhv_vnode *vnode) -{ - xfs_mount_t *mp; - xfs_inode_t *ip; - xfs_dquot_t *dqp; - xfs_disk_dquot_t *dp; - __uint64_t limit; - int error; - - error = bhv_next_vfs_statvfs(BHV_NEXT(bhv), statp, vnode); - if (error || !vnode) - return error; - - mp = xfs_vfstom(bhvtovfs(bhv)); - ip = xfs_vtoi(vnode); - - if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) - return 0; - if (!(mp->m_qflags & XFS_PQUOTA_ACCT)) - return 0; - if (!(mp->m_qflags & XFS_OQUOTA_ENFD)) - return 0; - - if (xfs_qm_dqget(mp, NULL, ip->i_d.di_projid, XFS_DQ_PROJ, 0, &dqp)) - return 0; - dp = &dqp->q_core; - - limit = dp->d_blk_softlimit ? - be64_to_cpu(dp->d_blk_softlimit) : - be64_to_cpu(dp->d_blk_hardlimit); - if (limit && statp->f_blocks > limit) { - statp->f_blocks = limit; - statp->f_bfree = - (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ? - (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0; - } - - limit = dp->d_ino_softlimit ? - be64_to_cpu(dp->d_ino_softlimit) : - be64_to_cpu(dp->d_ino_hardlimit); - if (limit && statp->f_files > limit) { - statp->f_files = limit; - statp->f_ffree = - (statp->f_files > be64_to_cpu(dp->d_icount)) ? - (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0; - } - - xfs_qm_dqput(dqp); - return 0; -} - -STATIC int -xfs_qm_syncall( - struct bhv_desc *bhv, - int flags, - cred_t *credp) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhv); - struct xfs_mount *mp = XFS_VFSTOM(vfsp); - int error; - - /* - * Get the Quota Manager to flush the dquots. - */ - if (XFS_IS_QUOTA_ON(mp)) { - if ((error = xfs_qm_sync(mp, flags))) { - /* - * If we got an IO error, we will be shutting down. - * So, there's nothing more for us to do here. - */ - ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp)); - if (XFS_FORCED_SHUTDOWN(mp)) { - return XFS_ERROR(error); - } - } - } - return bhv_next_vfs_sync(BHV_NEXT(bhv), flags, credp); -} - -STATIC int -xfs_qm_newmount( - xfs_mount_t *mp, - uint *needquotamount, - uint *quotaflags) -{ - uint quotaondisk; - uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; - - *quotaflags = 0; - *needquotamount = B_FALSE; - - quotaondisk = XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && - (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); - - if (quotaondisk) { - uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; - pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; - gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; - } - - /* - * If the device itself is read-only, we can't allow - * the user to change the state of quota on the mount - - * this would generate a transaction on the ro device, - * which would lead to an I/O error and shutdown - */ - - if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || - (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || - (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || - (!pquotaondisk && XFS_IS_PQUOTA_ON(mp)) || - (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || - (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && - xfs_dev_is_read_only(mp, "changing quota state")) { - cmn_err(CE_WARN, - "XFS: please mount with%s%s%s%s.", - (!quotaondisk ? "out quota" : ""), - (uquotaondisk ? " usrquota" : ""), - (pquotaondisk ? " prjquota" : ""), - (gquotaondisk ? " grpquota" : "")); - return XFS_ERROR(EPERM); - } - - if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { - /* - * Call mount_quotas at this point only if we won't have to do - * a quotacheck. - */ - if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { - /* - * If an error occured, qm_mount_quotas code - * has already disabled quotas. So, just finish - * mounting, and get on with the boring life - * without disk quotas. - */ - xfs_qm_mount_quotas(mp, 0); - } else { - /* - * Clear the quota flags, but remember them. This - * is so that the quota code doesn't get invoked - * before we're ready. This can happen when an - * inode goes inactive and wants to free blocks, - * or via xfs_log_mount_finish. - */ - *needquotamount = B_TRUE; - *quotaflags = mp->m_qflags; - mp->m_qflags = 0; - } - } - - return 0; -} - -STATIC int -xfs_qm_endmount( - xfs_mount_t *mp, - uint needquotamount, - uint quotaflags, - int mfsi_flags) -{ - if (needquotamount) { - ASSERT(mp->m_qflags == 0); - mp->m_qflags = quotaflags; - xfs_qm_mount_quotas(mp, mfsi_flags); - } - -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) - if (! (XFS_IS_QUOTA_ON(mp))) - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas not turned on"); - else - xfs_fs_cmn_err(CE_NOTE, mp, "Disk quotas turned on"); -#endif - -#ifdef QUOTADEBUG - if (XFS_IS_QUOTA_ON(mp) && xfs_qm_internalqcheck(mp)) - cmn_err(CE_WARN, "XFS: mount internalqcheck failed"); -#endif - - return 0; -} - -STATIC void -xfs_qm_dqrele_null( - xfs_dquot_t *dq) -{ - /* - * Called from XFS, where we always check first for a NULL dquot. - */ - if (!dq) - return; - xfs_qm_dqrele(dq); -} - - -STATIC struct xfs_qmops xfs_qmcore_xfs = { - .xfs_qminit = xfs_qm_newmount, - .xfs_qmdone = xfs_qm_unmount_quotadestroy, - .xfs_qmmount = xfs_qm_endmount, - .xfs_qmunmount = xfs_qm_unmount_quotas, - .xfs_dqrele = xfs_qm_dqrele_null, - .xfs_dqattach = xfs_qm_dqattach, - .xfs_dqdetach = xfs_qm_dqdetach, - .xfs_dqpurgeall = xfs_qm_dqpurge_all, - .xfs_dqvopalloc = xfs_qm_vop_dqalloc, - .xfs_dqvopcreate = xfs_qm_vop_dqattach_and_dqmod_newinode, - .xfs_dqvoprename = xfs_qm_vop_rename_dqattach, - .xfs_dqvopchown = xfs_qm_vop_chown, - .xfs_dqvopchownresv = xfs_qm_vop_chown_reserve, - .xfs_dqtrxops = &xfs_trans_dquot_ops, -}; - -struct bhv_module_vfsops xfs_qmops = { { - BHV_IDENTITY_INIT(VFS_BHV_QM, VFS_POSITION_QM), - .vfs_parseargs = xfs_qm_parseargs, - .vfs_showargs = xfs_qm_showargs, - .vfs_mount = xfs_qm_mount, - .vfs_statvfs = xfs_qm_statvfs, - .vfs_sync = xfs_qm_syncall, - .vfs_quotactl = xfs_qm_quotactl, }, -}; - - -void __init -xfs_qm_init(void) -{ - static char message[] __initdata = - KERN_INFO "SGI XFS Quota Management subsystem\n"; - - printk(message); - mutex_init(&xfs_Gqm_lock); - vfs_bhv_set_custom(&xfs_qmops, &xfs_qmcore_xfs); - xfs_qm_init_procfs(); -} - -void __exit -xfs_qm_exit(void) -{ - vfs_bhv_clr_custom(&xfs_qmops); - xfs_qm_cleanup_procfs(); - if (qm_dqzone) - kmem_zone_destroy(qm_dqzone); - if (qm_dqtrxzone) - kmem_zone_destroy(qm_dqtrxzone); -} diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c deleted file mode 100644 index 6f858fb81a3..00000000000 --- a/fs/xfs/quota/xfs_qm_stats.c +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_qm.h" - -struct xqmstats xqmstats; - -STATIC int -xfs_qm_read_xfsquota( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) -{ - int len; - - /* maximum; incore; ratio free to inuse; freelist */ - len = sprintf(buffer, "%d\t%d\t%d\t%u\n", - ndquot, - xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, - xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, - xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); - - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; -} - -STATIC int -xfs_qm_read_stats( - char *buffer, - char **start, - off_t offset, - int count, - int *eof, - void *data) -{ - int len; - - /* quota performance statistics */ - len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", - xqmstats.xs_qm_dqreclaims, - xqmstats.xs_qm_dqreclaim_misses, - xqmstats.xs_qm_dquot_dups, - xqmstats.xs_qm_dqcachemisses, - xqmstats.xs_qm_dqcachehits, - xqmstats.xs_qm_dqwants, - xqmstats.xs_qm_dqshake_reclaims, - xqmstats.xs_qm_dqinact_reclaims); - - if (offset >= len) { - *start = buffer; - *eof = 1; - return 0; - } - *start = buffer + offset; - if ((len -= offset) > count) - return count; - *eof = 1; - - return len; -} - -void -xfs_qm_init_procfs(void) -{ - create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); - create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); -} - -void -xfs_qm_cleanup_procfs(void) -{ - remove_proc_entry("fs/xfs/xqm", NULL); - remove_proc_entry("fs/xfs/xqmstat", NULL); -} diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h deleted file mode 100644 index a50ffabcf55..00000000000 --- a/fs/xfs/quota/xfs_qm_stats.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2002 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QM_STATS_H__ -#define __XFS_QM_STATS_H__ - -#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) - -/* - * XQM global statistics - */ -struct xqmstats { - __uint32_t xs_qm_dqreclaims; - __uint32_t xs_qm_dqreclaim_misses; - __uint32_t xs_qm_dquot_dups; - __uint32_t xs_qm_dqcachemisses; - __uint32_t xs_qm_dqcachehits; - __uint32_t xs_qm_dqwants; - __uint32_t xs_qm_dqshake_reclaims; - __uint32_t xs_qm_dqinact_reclaims; -}; - -extern struct xqmstats xqmstats; - -# define XQM_STATS_INC(count) ( (count)++ ) - -extern void xfs_qm_init_procfs(void); -extern void xfs_qm_cleanup_procfs(void); - -#else - -# define XQM_STATS_INC(count) do { } while (0) - -static __inline void xfs_qm_init_procfs(void) { }; -static __inline void xfs_qm_cleanup_procfs(void) { }; - -#endif - -#endif /* __XFS_QM_STATS_H__ */ diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c deleted file mode 100644 index ed620c4d159..00000000000 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ /dev/null @@ -1,1481 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/capability.h> - -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_rtalloc.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" -#include "xfs_utils.h" -#include "xfs_qm.h" - -#ifdef DEBUG -# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args) -#else -# define qdprintk(s, args...) do { } while (0) -#endif - -STATIC int xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint); -STATIC int xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -STATIC int xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *); -STATIC int xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint, - fs_disk_quota_t *); -STATIC int xfs_qm_scall_quotaon(xfs_mount_t *, uint); -STATIC int xfs_qm_scall_quotaoff(xfs_mount_t *, uint, boolean_t); -STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); -STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, - uint); -STATIC uint xfs_qm_import_flags(uint); -STATIC uint xfs_qm_export_flags(uint); -STATIC uint xfs_qm_import_qtype_flags(uint); -STATIC uint xfs_qm_export_qtype_flags(uint); -STATIC void xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *, - fs_disk_quota_t *); - - -/* - * The main distribution switch of all XFS quotactl system calls. - */ -int -xfs_qm_quotactl( - struct bhv_desc *bdp, - int cmd, - int id, - xfs_caddr_t addr) -{ - xfs_mount_t *mp; - bhv_vfs_t *vfsp; - int error; - - vfsp = bhvtovfs(bdp); - mp = XFS_VFSTOM(vfsp); - - ASSERT(addr != NULL || cmd == Q_XQUOTASYNC); - - /* - * The following commands are valid even when quotaoff. - */ - switch (cmd) { - case Q_XQUOTARM: - /* - * Truncate quota files. quota must be off. - */ - if (XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(EINVAL); - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - return (xfs_qm_scall_trunc_qfiles(mp, - xfs_qm_import_qtype_flags(*(uint *)addr))); - - case Q_XGETQSTAT: - /* - * Get quota status information. - */ - return (xfs_qm_scall_getqstat(mp, (fs_quota_stat_t *)addr)); - - case Q_XQUOTAON: - /* - * QUOTAON - enabling quota enforcement. - * Quota accounting must be turned on at mount time. - */ - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - return (xfs_qm_scall_quotaon(mp, - xfs_qm_import_flags(*(uint *)addr))); - - case Q_XQUOTAOFF: - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - break; - - case Q_XQUOTASYNC: - return (xfs_sync_inodes(mp, SYNC_DELWRI, 0, NULL)); - - default: - break; - } - - if (! XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(ESRCH); - - switch (cmd) { - case Q_XQUOTAOFF: - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_quotaoff(mp, - xfs_qm_import_flags(*(uint *)addr), - B_FALSE); - break; - - case Q_XGETQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_USER, - (fs_disk_quota_t *)addr); - break; - case Q_XGETGQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); - break; - case Q_XGETPQUOTA: - error = xfs_qm_scall_getquota(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, - (fs_disk_quota_t *)addr); - break; - - case Q_XSETQLIM: - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_USER, - (fs_disk_quota_t *)addr); - break; - case Q_XSETGQLIM: - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_GROUP, - (fs_disk_quota_t *)addr); - break; - case Q_XSETPQLIM: - if (vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - error = xfs_qm_scall_setqlim(mp, (xfs_dqid_t)id, XFS_DQ_PROJ, - (fs_disk_quota_t *)addr); - break; - - default: - error = XFS_ERROR(EINVAL); - break; - } - - return (error); -} - -/* - * Turn off quota accounting and/or enforcement for all udquots and/or - * gdquots. Called only at unmount time. - * - * This assumes that there are no dquots of this file system cached - * incore, and modifies the ondisk dquot directly. Therefore, for example, - * it is an error to call this twice, without purging the cache. - */ -STATIC int -xfs_qm_scall_quotaoff( - xfs_mount_t *mp, - uint flags, - boolean_t force) -{ - uint dqtype; - unsigned long s; - int error; - uint inactivate_flags; - xfs_qoff_logitem_t *qoffstart; - int nculprits; - - if (!force && !capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - /* - * No file system can have quotas enabled on disk but not in core. - * Note that quota utilities (like quotaoff) _expect_ - * errno == EEXIST here. - */ - if ((mp->m_qflags & flags) == 0) - return XFS_ERROR(EEXIST); - error = 0; - - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); - - /* - * We don't want to deal with two quotaoffs messing up each other, - * so we're going to serialize it. quotaoff isn't exactly a performance - * critical thing. - * If quotaoff, then we must be dealing with the root filesystem. - */ - ASSERT(mp->m_quotainfo); - if (mp->m_quotainfo) - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - - ASSERT(mp->m_quotainfo); - - /* - * If we're just turning off quota enforcement, change mp and go. - */ - if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { - mp->m_qflags &= ~(flags); - - s = XFS_SB_LOCK(mp); - mp->m_sb.sb_qflags = mp->m_qflags; - XFS_SB_UNLOCK(mp, s); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - - /* XXX what to do if error ? Revert back to old vals incore ? */ - error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); - return (error); - } - - dqtype = 0; - inactivate_flags = 0; - /* - * If accounting is off, we must turn enforcement off, clear the - * quota 'CHKD' certificate to make it known that we have to - * do a quotacheck the next time this quota is turned on. - */ - if (flags & XFS_UQUOTA_ACCT) { - dqtype |= XFS_QMOPT_UQUOTA; - flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); - inactivate_flags |= XFS_UQUOTA_ACTIVE; - } - if (flags & XFS_GQUOTA_ACCT) { - dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); - inactivate_flags |= XFS_GQUOTA_ACTIVE; - } else if (flags & XFS_PQUOTA_ACCT) { - dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD); - inactivate_flags |= XFS_PQUOTA_ACTIVE; - } - - /* - * Nothing to do? Don't complain. This happens when we're just - * turning off quota enforcement. - */ - if ((mp->m_qflags & flags) == 0) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - return (0); - } - - /* - * Write the LI_QUOTAOFF log record, and do SB changes atomically, - * and synchronously. - */ - xfs_qm_log_quotaoff(mp, &qoffstart, flags); - - /* - * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct - * to take care of the race between dqget and quotaoff. We don't take - * any special locks to reset these bits. All processes need to check - * these bits *after* taking inode lock(s) to see if the particular - * quota type is in the process of being turned off. If *ACTIVE, it is - * guaranteed that all dquot structures and all quotainode ptrs will all - * stay valid as long as that inode is kept locked. - * - * There is no turning back after this. - */ - mp->m_qflags &= ~inactivate_flags; - - /* - * Give back all the dquot reference(s) held by inodes. - * Here we go thru every single incore inode in this file system, and - * do a dqrele on the i_udquot/i_gdquot that it may have. - * Essentially, as long as somebody has an inode locked, this guarantees - * that quotas will not be turned off. This is handy because in a - * transaction once we lock the inode(s) and check for quotaon, we can - * depend on the quota inodes (and other things) being valid as long as - * we keep the lock(s). - */ - xfs_qm_dqrele_all_inodes(mp, flags); - - /* - * Next we make the changes in the quota flag in the mount struct. - * This isn't protected by a particular lock directly, because we - * don't want to take a mrlock everytime we depend on quotas being on. - */ - mp->m_qflags &= ~(flags); - - /* - * Go through all the dquots of this file system and purge them, - * according to what was turned off. We may not be able to get rid - * of all dquots, because dquots can have temporary references that - * are not attached to inodes. eg. xfs_setattr, xfs_create. - * So, if we couldn't purge all the dquots from the filesystem, - * we can't get rid of the incore data structures. - */ - while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype|XFS_QMOPT_QUOTAOFF))) - delay(10 * nculprits); - - /* - * Transactions that had started before ACTIVE state bit was cleared - * could have logged many dquots, so they'd have higher LSNs than - * the first QUOTAOFF log record does. If we happen to crash when - * the tail of the log has gone past the QUOTAOFF record, but - * before the last dquot modification, those dquots __will__ - * recover, and that's not good. - * - * So, we have QUOTAOFF start and end logitems; the start - * logitem won't get overwritten until the end logitem appears... - */ - xfs_qm_log_quotaoff_end(mp, qoffstart, flags); - - /* - * If quotas is completely disabled, close shop. - */ - if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) || - ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) { - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - xfs_qm_destroy_quotainfo(mp); - return (0); - } - - /* - * Release our quotainode references, and vn_purge them, - * if we don't need them anymore. - */ - if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) { - XFS_PURGE_INODE(XFS_QI_UQIP(mp)); - XFS_QI_UQIP(mp) = NULL; - } - if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) { - XFS_PURGE_INODE(XFS_QI_GQIP(mp)); - XFS_QI_GQIP(mp) = NULL; - } - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - - return (error); -} - -STATIC int -xfs_qm_scall_trunc_qfiles( - xfs_mount_t *mp, - uint flags) -{ - int error; - xfs_inode_t *qip; - - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - error = 0; - if (!XFS_SB_VERSION_HASQUOTA(&mp->m_sb) || flags == 0) { - qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); - return XFS_ERROR(EINVAL); - } - - if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); - if (! error) { - (void) xfs_truncate_file(mp, qip); - VN_RELE(XFS_ITOV(qip)); - } - } - - if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && - mp->m_sb.sb_gquotino != NULLFSINO) { - error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); - if (! error) { - (void) xfs_truncate_file(mp, qip); - VN_RELE(XFS_ITOV(qip)); - } - } - - return (error); -} - - -/* - * Switch on (a given) quota enforcement for a filesystem. This takes - * effect immediately. - * (Switching on quota accounting must be done at mount time.) - */ -STATIC int -xfs_qm_scall_quotaon( - xfs_mount_t *mp, - uint flags) -{ - int error; - unsigned long s; - uint qf; - uint accflags; - __int64_t sbflags; - - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); - /* - * Switching on quota accounting must be done at mount time. - */ - accflags = flags & XFS_ALL_QUOTA_ACCT; - flags &= ~(XFS_ALL_QUOTA_ACCT); - - sbflags = 0; - - if (flags == 0) { - qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); - return XFS_ERROR(EINVAL); - } - - /* No fs can turn on quotas with a delayed effect */ - ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0); - - /* - * Can't enforce without accounting. We check the superblock - * qflags here instead of m_qflags because rootfs can have - * quota acct on ondisk without m_qflags' knowing. - */ - if (((flags & XFS_UQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && - (flags & XFS_UQUOTA_ENFD)) - || - ((flags & XFS_PQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && - (flags & XFS_OQUOTA_ENFD)) - || - ((flags & XFS_GQUOTA_ACCT) == 0 && - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && - (flags & XFS_OQUOTA_ENFD))) { - qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", - flags, mp->m_sb.sb_qflags); - return XFS_ERROR(EINVAL); - } - /* - * If everything's upto-date incore, then don't waste time. - */ - if ((mp->m_qflags & flags) == flags) - return XFS_ERROR(EEXIST); - - /* - * Change sb_qflags on disk but not incore mp->qflags - * if this is the root filesystem. - */ - s = XFS_SB_LOCK(mp); - qf = mp->m_sb.sb_qflags; - mp->m_sb.sb_qflags = qf | flags; - XFS_SB_UNLOCK(mp, s); - - /* - * There's nothing to change if it's the same. - */ - if ((qf & flags) == flags && sbflags == 0) - return XFS_ERROR(EEXIST); - sbflags |= XFS_SB_QFLAGS; - - if ((error = xfs_qm_write_sb_changes(mp, sbflags))) - return (error); - /* - * If we aren't trying to switch on quota enforcement, we are done. - */ - if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != - (mp->m_qflags & XFS_UQUOTA_ACCT)) || - ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != - (mp->m_qflags & XFS_PQUOTA_ACCT)) || - ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != - (mp->m_qflags & XFS_GQUOTA_ACCT)) || - (flags & XFS_ALL_QUOTA_ENFD) == 0) - return (0); - - if (! XFS_IS_QUOTA_RUNNING(mp)) - return XFS_ERROR(ESRCH); - - /* - * Switch on quota enforcement in core. - */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - - return (0); -} - - -/* - * Return quota status information, such as uquota-off, enforcements, etc. - */ -STATIC int -xfs_qm_scall_getqstat( - xfs_mount_t *mp, - fs_quota_stat_t *out) -{ - xfs_inode_t *uip, *gip; - boolean_t tempuqip, tempgqip; - - uip = gip = NULL; - tempuqip = tempgqip = B_FALSE; - memset(out, 0, sizeof(fs_quota_stat_t)); - - out->qs_version = FS_QSTAT_VERSION; - if (! XFS_SB_VERSION_HASQUOTA(&mp->m_sb)) { - out->qs_uquota.qfs_ino = NULLFSINO; - out->qs_gquota.qfs_ino = NULLFSINO; - return (0); - } - out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & - (XFS_ALL_QUOTA_ACCT| - XFS_ALL_QUOTA_ENFD)); - out->qs_pad = 0; - out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; - out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - - if (mp->m_quotainfo) { - uip = mp->m_quotainfo->qi_uquotaip; - gip = mp->m_quotainfo->qi_gquotaip; - } - if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip, 0) == 0) - tempuqip = B_TRUE; - } - if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip, 0) == 0) - tempgqip = B_TRUE; - } - if (uip) { - out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; - out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; - if (tempuqip) - VN_RELE(XFS_ITOV(uip)); - } - if (gip) { - out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; - out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; - if (tempgqip) - VN_RELE(XFS_ITOV(gip)); - } - if (mp->m_quotainfo) { - out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); - out->qs_btimelimit = XFS_QI_BTIMELIMIT(mp); - out->qs_itimelimit = XFS_QI_ITIMELIMIT(mp); - out->qs_rtbtimelimit = XFS_QI_RTBTIMELIMIT(mp); - out->qs_bwarnlimit = XFS_QI_BWARNLIMIT(mp); - out->qs_iwarnlimit = XFS_QI_IWARNLIMIT(mp); - } - return (0); -} - -/* - * Adjust quota limits, and start/stop timers accordingly. - */ -STATIC int -xfs_qm_scall_setqlim( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - fs_disk_quota_t *newlim) -{ - xfs_disk_dquot_t *ddq; - xfs_dquot_t *dqp; - xfs_trans_t *tp; - int error; - xfs_qcnt_t hard, soft; - - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - - if ((newlim->d_fieldmask & - (FS_DQ_LIMIT_MASK|FS_DQ_TIMER_MASK|FS_DQ_WARNS_MASK)) == 0) - return (0); - - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - return (error); - } - - /* - * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). - */ - mutex_lock(&(XFS_QI_QOFFLOCK(mp))); - - /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. - */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - ASSERT(error != ENOENT); - return (error); - } - xfs_dqtrace_entry(dqp, "Q_SETQLIM: AFT DQGET"); - xfs_trans_dqjoin(tp, dqp); - ddq = &dqp->q_core; - - /* - * Make sure that hardlimits are >= soft limits before changing. - */ - hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : - be64_to_cpu(ddq->d_blk_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : - be64_to_cpu(ddq->d_blk_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_blk_hardlimit = cpu_to_be64(hard); - ddq->d_blk_softlimit = cpu_to_be64(soft); - if (id == 0) { - mp->m_quotainfo->qi_bhardlimit = hard; - mp->m_quotainfo->qi_bsoftlimit = soft; - } - } else { - qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); - } - hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : - be64_to_cpu(ddq->d_rtb_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? - (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : - be64_to_cpu(ddq->d_rtb_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_rtb_hardlimit = cpu_to_be64(hard); - ddq->d_rtb_softlimit = cpu_to_be64(soft); - if (id == 0) { - mp->m_quotainfo->qi_rtbhardlimit = hard; - mp->m_quotainfo->qi_rtbsoftlimit = soft; - } - } else { - qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); - } - - hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? - (xfs_qcnt_t) newlim->d_ino_hardlimit : - be64_to_cpu(ddq->d_ino_hardlimit); - soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? - (xfs_qcnt_t) newlim->d_ino_softlimit : - be64_to_cpu(ddq->d_ino_softlimit); - if (hard == 0 || hard >= soft) { - ddq->d_ino_hardlimit = cpu_to_be64(hard); - ddq->d_ino_softlimit = cpu_to_be64(soft); - if (id == 0) { - mp->m_quotainfo->qi_ihardlimit = hard; - mp->m_quotainfo->qi_isoftlimit = soft; - } - } else { - qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); - } - - /* - * Update warnings counter(s) if requested - */ - if (newlim->d_fieldmask & FS_DQ_BWARNS) - ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); - if (newlim->d_fieldmask & FS_DQ_IWARNS) - ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); - - if (id == 0) { - /* - * Timelimits for the super user set the relative time - * the other users can be over quota for this file system. - * If it is zero a default is used. Ditto for the default - * soft and hard limit values (already done, above), and - * for warnings. - */ - if (newlim->d_fieldmask & FS_DQ_BTIMER) { - mp->m_quotainfo->qi_btimelimit = newlim->d_btimer; - ddq->d_btimer = cpu_to_be32(newlim->d_btimer); - } - if (newlim->d_fieldmask & FS_DQ_ITIMER) { - mp->m_quotainfo->qi_itimelimit = newlim->d_itimer; - ddq->d_itimer = cpu_to_be32(newlim->d_itimer); - } - if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { - mp->m_quotainfo->qi_rtbtimelimit = newlim->d_rtbtimer; - ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); - } - if (newlim->d_fieldmask & FS_DQ_BWARNS) - mp->m_quotainfo->qi_bwarnlimit = newlim->d_bwarns; - if (newlim->d_fieldmask & FS_DQ_IWARNS) - mp->m_quotainfo->qi_iwarnlimit = newlim->d_iwarns; - if (newlim->d_fieldmask & FS_DQ_RTBWARNS) - mp->m_quotainfo->qi_rtbwarnlimit = newlim->d_rtbwarns; - } else { - /* - * If the user is now over quota, start the timelimit. - * The user will not be 'warned'. - * Note that we keep the timers ticking, whether enforcement - * is on or off. We don't really want to bother with iterating - * over all ondisk dquots and turning the timers on/off. - */ - xfs_qm_adjust_dqtimers(mp, ddq); - } - dqp->dq_flags |= XFS_DQ_DIRTY; - xfs_trans_log_dquot(tp, dqp); - - xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); - xfs_trans_commit(tp, 0, NULL); - xfs_qm_dqprint(dqp); - xfs_qm_dqrele(dqp); - mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); - - return (0); -} - -STATIC int -xfs_qm_scall_getquota( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - fs_disk_quota_t *out) -{ - xfs_dquot_t *dqp; - int error; - - /* - * Try to get the dquot. We don't want it allocated on disk, so - * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't - * exist, we'll get ENOENT back. - */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) { - return (error); - } - - xfs_dqtrace_entry(dqp, "Q_GETQUOTA SUCCESS"); - /* - * If everything's NULL, this dquot doesn't quite exist as far as - * our utility programs are concerned. - */ - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_qm_dqput(dqp); - return XFS_ERROR(ENOENT); - } - /* xfs_qm_dqprint(dqp); */ - /* - * Convert the disk dquot to the exportable format - */ - xfs_qm_export_dquot(mp, &dqp->q_core, out); - xfs_qm_dqput(dqp); - return (error ? XFS_ERROR(EFAULT) : 0); -} - - -STATIC int -xfs_qm_log_quotaoff_end( - xfs_mount_t *mp, - xfs_qoff_logitem_t *startqoff, - uint flags) -{ - xfs_trans_t *tp; - int error; - xfs_qoff_logitem_t *qoffi; - - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); - - if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, - 0, 0, XFS_DEFAULT_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - return (error); - } - - qoffi = xfs_trans_get_qoff_item(tp, startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); - return (error); -} - - -STATIC int -xfs_qm_log_quotaoff( - xfs_mount_t *mp, - xfs_qoff_logitem_t **qoffstartp, - uint flags) -{ - xfs_trans_t *tp; - int error; - unsigned long s; - xfs_qoff_logitem_t *qoffi=NULL; - uint oldsbqflag=0; - - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); - if ((error = xfs_trans_reserve(tp, 0, - sizeof(xfs_qoff_logitem_t) * 2 + - mp->m_sb.sb_sectsize + 128, - 0, - 0, - XFS_DEFAULT_LOG_COUNT))) { - goto error0; - } - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - s = XFS_SB_LOCK(mp); - oldsbqflag = mp->m_sb.sb_qflags; - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - XFS_SB_UNLOCK(mp, s); - - xfs_mod_sb(tp, XFS_SB_QFLAGS); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); - -error0: - if (error) { - xfs_trans_cancel(tp, 0); - /* - * No one else is modifying sb_qflags, so this is OK. - * We still hold the quotaofflock. - */ - s = XFS_SB_LOCK(mp); - mp->m_sb.sb_qflags = oldsbqflag; - XFS_SB_UNLOCK(mp, s); - } - *qoffstartp = qoffi; - return (error); -} - - -/* - * Translate an internal style on-disk-dquot to the exportable format. - * The main differences are that the counters/limits are all in Basic - * Blocks (BBs) instead of the internal FSBs, and all on-disk data has - * to be converted to the native endianness. - */ -STATIC void -xfs_qm_export_dquot( - xfs_mount_t *mp, - xfs_disk_dquot_t *src, - struct fs_disk_quota *dst) -{ - memset(dst, 0, sizeof(*dst)); - dst->d_version = FS_DQUOT_VERSION; /* different from src->d_version */ - dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags); - dst->d_id = be32_to_cpu(src->d_id); - dst->d_blk_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit)); - dst->d_blk_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit)); - dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit); - dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit); - dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount)); - dst->d_icount = be64_to_cpu(src->d_icount); - dst->d_btimer = be32_to_cpu(src->d_btimer); - dst->d_itimer = be32_to_cpu(src->d_itimer); - dst->d_iwarns = be16_to_cpu(src->d_iwarns); - dst->d_bwarns = be16_to_cpu(src->d_bwarns); - dst->d_rtb_hardlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit)); - dst->d_rtb_softlimit = - XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit)); - dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount)); - dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer); - dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns); - - /* - * Internally, we don't reset all the timers when quota enforcement - * gets turned off. No need to confuse the user level code, - * so return zeroes in that case. - */ - if (! XFS_IS_QUOTA_ENFORCED(mp)) { - dst->d_btimer = 0; - dst->d_itimer = 0; - dst->d_rtbtimer = 0; - } - -#ifdef DEBUG - if (XFS_IS_QUOTA_ENFORCED(mp) && dst->d_id != 0) { - if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && - (dst->d_blk_softlimit > 0)) { - ASSERT(dst->d_btimer != 0); - } - if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) && - (dst->d_ino_softlimit > 0)) { - ASSERT(dst->d_itimer != 0); - } - } -#endif -} - -STATIC uint -xfs_qm_import_qtype_flags( - uint uflags) -{ - uint oflags = 0; - - /* - * Can't be more than one, or none. - */ - if (((uflags & (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_USER_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) == - (XFS_GROUP_QUOTA | XFS_PROJ_QUOTA)) || - ((uflags & (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) == - (XFS_USER_QUOTA | XFS_PROJ_QUOTA)) || - ((uflags & (XFS_GROUP_QUOTA|XFS_USER_QUOTA|XFS_PROJ_QUOTA)) == 0)) - return (0); - - oflags |= (uflags & XFS_USER_QUOTA) ? XFS_DQ_USER : 0; - oflags |= (uflags & XFS_PROJ_QUOTA) ? XFS_DQ_PROJ : 0; - oflags |= (uflags & XFS_GROUP_QUOTA) ? XFS_DQ_GROUP: 0; - return oflags; -} - -STATIC uint -xfs_qm_export_qtype_flags( - uint flags) -{ - /* - * Can't be more than one, or none. - */ - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); - - return (flags & XFS_DQ_USER) ? - XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; -} - -STATIC uint -xfs_qm_import_flags( - uint uflags) -{ - uint flags = 0; - - if (uflags & XFS_QUOTA_UDQ_ACCT) - flags |= XFS_UQUOTA_ACCT; - if (uflags & XFS_QUOTA_PDQ_ACCT) - flags |= XFS_PQUOTA_ACCT; - if (uflags & XFS_QUOTA_GDQ_ACCT) - flags |= XFS_GQUOTA_ACCT; - if (uflags & XFS_QUOTA_UDQ_ENFD) - flags |= XFS_UQUOTA_ENFD; - if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) - flags |= XFS_OQUOTA_ENFD; - return (flags); -} - - -STATIC uint -xfs_qm_export_flags( - uint flags) -{ - uint uflags; - - uflags = 0; - if (flags & XFS_UQUOTA_ACCT) - uflags |= XFS_QUOTA_UDQ_ACCT; - if (flags & XFS_PQUOTA_ACCT) - uflags |= XFS_QUOTA_PDQ_ACCT; - if (flags & XFS_GQUOTA_ACCT) - uflags |= XFS_QUOTA_GDQ_ACCT; - if (flags & XFS_UQUOTA_ENFD) - uflags |= XFS_QUOTA_UDQ_ENFD; - if (flags & (XFS_OQUOTA_ENFD)) { - uflags |= (flags & XFS_GQUOTA_ACCT) ? - XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; - } - return (uflags); -} - - -/* - * Go thru all the inodes in the file system, releasing their dquots. - * Note that the mount structure gets modified to indicate that quotas are off - * AFTER this, in the case of quotaoff. This also gets called from - * xfs_rootumount. - */ -void -xfs_qm_dqrele_all_inodes( - struct xfs_mount *mp, - uint flags) -{ - xfs_inode_t *ip, *topino; - uint ireclaims; - bhv_vnode_t *vp; - boolean_t vnode_refd; - - ASSERT(mp->m_quotainfo); - - XFS_MOUNT_ILOCK(mp); -again: - ip = mp->m_inodes; - if (ip == NULL) { - XFS_MOUNT_IUNLOCK(mp); - return; - } - do { - /* Skip markers inserted by xfs_sync */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; - } - /* Root inode, rbmip and rsumip have associated blocks */ - if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) { - ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_gdquot == NULL); - ip = ip->i_mnext; - continue; - } - vp = XFS_ITOV_NULL(ip); - if (!vp) { - ASSERT(ip->i_udquot == NULL); - ASSERT(ip->i_gdquot == NULL); - ip = ip->i_mnext; - continue; - } - vnode_refd = B_FALSE; - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { - ireclaims = mp->m_ireclaims; - topino = mp->m_inodes; - vp = vn_grab(vp); - if (!vp) - goto again; - - XFS_MOUNT_IUNLOCK(mp); - /* XXX restart limit ? */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - vnode_refd = B_TRUE; - } else { - ireclaims = mp->m_ireclaims; - topino = mp->m_inodes; - XFS_MOUNT_IUNLOCK(mp); - } - - /* - * We don't keep the mountlock across the dqrele() call, - * since it can take a while.. - */ - if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - /* - * Wait until we've dropped the ilock and mountlock to - * do the vn_rele. Or be condemned to an eternity in the - * inactive code in hell. - */ - if (vnode_refd) - VN_RELE(vp); - XFS_MOUNT_ILOCK(mp); - /* - * If an inode was inserted or removed, we gotta - * start over again. - */ - if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) { - /* XXX use a sentinel */ - goto again; - } - ip = ip->i_mnext; - } while (ip != mp->m_inodes); - - XFS_MOUNT_IUNLOCK(mp); -} - -/*------------------------------------------------------------------------*/ -#ifdef DEBUG -/* - * This contains all the test functions for XFS disk quotas. - * Currently it does a quota accounting check. ie. it walks through - * all inodes in the file system, calculating the dquot accounting fields, - * and prints out any inconsistencies. - */ -xfs_dqhash_t *qmtest_udqtab; -xfs_dqhash_t *qmtest_gdqtab; -int qmtest_hashmask; -int qmtest_nfails; -mutex_t qcheck_lock; - -#define DQTEST_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ - (__psunsigned_t)(id)) & \ - (qmtest_hashmask - 1)) - -#define DQTEST_HASH(mp, id, type) ((type & XFS_DQ_USER) ? \ - (qmtest_udqtab + \ - DQTEST_HASHVAL(mp, id)) : \ - (qmtest_gdqtab + \ - DQTEST_HASHVAL(mp, id))) - -#define DQTEST_LIST_PRINT(l, NXT, title) \ -{ \ - xfs_dqtest_t *dqp; int i = 0;\ - cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ - for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ - dqp = (xfs_dqtest_t *)dqp->NXT) { \ - cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ - ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ - dqp->d_bcount, dqp->d_icount); } \ -} - -typedef struct dqtest { - xfs_dqmarker_t q_lists; - xfs_dqhash_t *q_hash; /* the hashchain header */ - xfs_mount_t *q_mount; /* filesystem this relates to */ - xfs_dqid_t d_id; /* user id or group id */ - xfs_qcnt_t d_bcount; /* # disk blocks owned by the user */ - xfs_qcnt_t d_icount; /* # inodes owned by the user */ -} xfs_dqtest_t; - -STATIC void -xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp) -{ - xfs_dquot_t *d; - if (((d) = (h)->qh_next)) - (d)->HL_PREVP = &((dqp)->HL_NEXT); - (dqp)->HL_NEXT = d; - (dqp)->HL_PREVP = &((h)->qh_next); - (h)->qh_next = (xfs_dquot_t *)dqp; - (h)->qh_version++; - (h)->qh_nelems++; -} -STATIC void -xfs_qm_dqtest_print( - xfs_dqtest_t *d) -{ - cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); - cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); - cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); - cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", - d->d_bcount, (int)d->d_bcount); - cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", - d->d_icount, (int)d->d_icount); - cmn_err(CE_DEBUG, "---------------------------"); -} - -STATIC void -xfs_qm_dqtest_failed( - xfs_dqtest_t *d, - xfs_dquot_t *dqp, - char *reason, - xfs_qcnt_t a, - xfs_qcnt_t b, - int error) -{ - qmtest_nfails++; - if (error) - cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", - d->d_id, error, reason); - else - cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", - d->d_id, reason, (int)a, (int)b); - xfs_qm_dqtest_print(d); - if (dqp) - xfs_qm_dqprint(dqp); -} - -STATIC int -xfs_dqtest_cmp2( - xfs_dqtest_t *d, - xfs_dquot_t *dqp) -{ - int err = 0; - if (be64_to_cpu(dqp->q_core.d_icount) != d->d_icount) { - xfs_qm_dqtest_failed(d, dqp, "icount mismatch", - be64_to_cpu(dqp->q_core.d_icount), - d->d_icount, 0); - err++; - } - if (be64_to_cpu(dqp->q_core.d_bcount) != d->d_bcount) { - xfs_qm_dqtest_failed(d, dqp, "bcount mismatch", - be64_to_cpu(dqp->q_core.d_bcount), - d->d_bcount, 0); - err++; - } - if (dqp->q_core.d_blk_softlimit && - be64_to_cpu(dqp->q_core.d_bcount) >= - be64_to_cpu(dqp->q_core.d_blk_softlimit)) { - if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { - cmn_err(CE_DEBUG, - "%d [%s] [0x%p] BLK TIMER NOT STARTED", - d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); - err++; - } - } - if (dqp->q_core.d_ino_softlimit && - be64_to_cpu(dqp->q_core.d_icount) >= - be64_to_cpu(dqp->q_core.d_ino_softlimit)) { - if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { - cmn_err(CE_DEBUG, - "%d [%s] [0x%p] INO TIMER NOT STARTED", - d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); - err++; - } - } -#ifdef QUOTADEBUG - if (!err) { - cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", - d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); - } -#endif - return (err); -} - -STATIC void -xfs_dqtest_cmp( - xfs_dqtest_t *d) -{ - xfs_dquot_t *dqp; - int error; - - /* xfs_qm_dqtest_print(d); */ - if ((error = xfs_qm_dqget(d->q_mount, NULL, d->d_id, d->dq_flags, 0, - &dqp))) { - xfs_qm_dqtest_failed(d, NULL, "dqget failed", 0, 0, error); - return; - } - xfs_dqtest_cmp2(d, dqp); - xfs_qm_dqput(dqp); -} - -STATIC int -xfs_qm_internalqcheck_dqget( - xfs_mount_t *mp, - xfs_dqid_t id, - uint type, - xfs_dqtest_t **O_dq) -{ - xfs_dqtest_t *d; - xfs_dqhash_t *h; - - h = DQTEST_HASH(mp, id, type); - for (d = (xfs_dqtest_t *) h->qh_next; d != NULL; - d = (xfs_dqtest_t *) d->HL_NEXT) { - /* DQTEST_LIST_PRINT(h, HL_NEXT, "@@@@@ dqtestlist @@@@@"); */ - if (d->d_id == id && mp == d->q_mount) { - *O_dq = d; - return (0); - } - } - d = kmem_zalloc(sizeof(xfs_dqtest_t), KM_SLEEP); - d->dq_flags = type; - d->d_id = id; - d->q_mount = mp; - d->q_hash = h; - xfs_qm_hashinsert(h, d); - *O_dq = d; - return (0); -} - -STATIC void -xfs_qm_internalqcheck_get_dquots( - xfs_mount_t *mp, - xfs_dqid_t uid, - xfs_dqid_t projid, - xfs_dqid_t gid, - xfs_dqtest_t **ud, - xfs_dqtest_t **gd) -{ - if (XFS_IS_UQUOTA_ON(mp)) - xfs_qm_internalqcheck_dqget(mp, uid, XFS_DQ_USER, ud); - if (XFS_IS_GQUOTA_ON(mp)) - xfs_qm_internalqcheck_dqget(mp, gid, XFS_DQ_GROUP, gd); - else if (XFS_IS_PQUOTA_ON(mp)) - xfs_qm_internalqcheck_dqget(mp, projid, XFS_DQ_PROJ, gd); -} - - -STATIC void -xfs_qm_internalqcheck_dqadjust( - xfs_inode_t *ip, - xfs_dqtest_t *d) -{ - d->d_icount++; - d->d_bcount += (xfs_qcnt_t)ip->i_d.di_nblocks; -} - -STATIC int -xfs_qm_internalqcheck_adjust( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - void __user *buffer, /* not used */ - int ubsize, /* not used */ - void *private_data, /* not used */ - xfs_daddr_t bno, /* starting block of inode cluster */ - int *ubused, /* not used */ - void *dip, /* not used */ - int *res) /* bulkstat result code */ -{ - xfs_inode_t *ip; - xfs_dqtest_t *ud, *gd; - uint lock_flags; - boolean_t ipreleased; - int error; - - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - - if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { - *res = BULKSTAT_RV_NOTHING; - qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", - (unsigned long long) ino, - (unsigned long long) mp->m_sb.sb_uquotino, - (unsigned long long) mp->m_sb.sb_gquotino); - return XFS_ERROR(EINVAL); - } - ipreleased = B_FALSE; - again: - lock_flags = XFS_ILOCK_SHARED; - if ((error = xfs_iget(mp, NULL, ino, 0, lock_flags, &ip, bno))) { - *res = BULKSTAT_RV_NOTHING; - return (error); - } - - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, lock_flags); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - - /* - * This inode can have blocks after eof which can get released - * when we send it to inactive. Since we don't check the dquot - * until the after all our calculations are done, we must get rid - * of those now. - */ - if (! ipreleased) { - xfs_iput(ip, lock_flags); - ipreleased = B_TRUE; - goto again; - } - xfs_qm_internalqcheck_get_dquots(mp, - (xfs_dqid_t) ip->i_d.di_uid, - (xfs_dqid_t) ip->i_d.di_projid, - (xfs_dqid_t) ip->i_d.di_gid, - &ud, &gd); - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(ud); - xfs_qm_internalqcheck_dqadjust(ip, ud); - } - if (XFS_IS_OQUOTA_ON(mp)) { - ASSERT(gd); - xfs_qm_internalqcheck_dqadjust(ip, gd); - } - xfs_iput(ip, lock_flags); - *res = BULKSTAT_RV_DIDONE; - return (0); -} - - -/* PRIVATE, debugging */ -int -xfs_qm_internalqcheck( - xfs_mount_t *mp) -{ - xfs_ino_t lastino; - int done, count; - int i; - xfs_dqtest_t *d, *e; - xfs_dqhash_t *h1; - int error; - - lastino = 0; - qmtest_hashmask = 32; - count = 5; - done = 0; - qmtest_nfails = 0; - - if (! XFS_IS_QUOTA_ON(mp)) - return XFS_ERROR(ESRCH); - - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); - XFS_bflush(mp->m_ddev_targp); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); - XFS_bflush(mp->m_ddev_targp); - - mutex_lock(&qcheck_lock); - /* There should be absolutely no quota activity while this - is going on. */ - qmtest_udqtab = kmem_zalloc(qmtest_hashmask * - sizeof(xfs_dqhash_t), KM_SLEEP); - qmtest_gdqtab = kmem_zalloc(qmtest_hashmask * - sizeof(xfs_dqhash_t), KM_SLEEP); - do { - /* - * Iterate thru all the inodes in the file system, - * adjusting the corresponding dquot counters - */ - if ((error = xfs_bulkstat(mp, &lastino, &count, - xfs_qm_internalqcheck_adjust, NULL, - 0, NULL, BULKSTAT_FG_IGET, &done))) { - break; - } - } while (! done); - if (error) { - cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); - } - cmn_err(CE_DEBUG, "Checking results against system dquots"); - for (i = 0; i < qmtest_hashmask; i++) { - h1 = &qmtest_udqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { - xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; - kmem_free(d, sizeof(xfs_dqtest_t)); - d = e; - } - h1 = &qmtest_gdqtab[i]; - for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) { - xfs_dqtest_cmp(d); - e = (xfs_dqtest_t *) d->HL_NEXT; - kmem_free(d, sizeof(xfs_dqtest_t)); - d = e; - } - } - - if (qmtest_nfails) { - cmn_err(CE_DEBUG, "******** quotacheck failed ********"); - cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); - } else { - cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); - } - kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); - kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t)); - mutex_unlock(&qcheck_lock); - return (qmtest_nfails); -} - -#endif /* DEBUG */ diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h deleted file mode 100644 index b7ddd04aae3..00000000000 --- a/fs/xfs/quota/xfs_quota_priv.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2000-2003 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_QUOTA_PRIV_H__ -#define __XFS_QUOTA_PRIV_H__ - -/* - * Number of bmaps that we ask from bmapi when doing a quotacheck. - * We make this restriction to keep the memory usage to a minimum. - */ -#define XFS_DQITER_MAP_SIZE 10 - -/* Number of dquots that fit in to a dquot block */ -#define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) - -#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE | MR_ACCESS) != 0) -#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE) != 0) - -#define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) - -#define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) -#define XFS_QI_UQIP(mp) ((mp)->m_quotainfo->qi_uquotaip) -#define XFS_QI_GQIP(mp) ((mp)->m_quotainfo->qi_gquotaip) -#define XFS_QI_DQCHUNKLEN(mp) ((mp)->m_quotainfo->qi_dqchunklen) -#define XFS_QI_BTIMELIMIT(mp) ((mp)->m_quotainfo->qi_btimelimit) -#define XFS_QI_RTBTIMELIMIT(mp) ((mp)->m_quotainfo->qi_rtbtimelimit) -#define XFS_QI_ITIMELIMIT(mp) ((mp)->m_quotainfo->qi_itimelimit) -#define XFS_QI_BWARNLIMIT(mp) ((mp)->m_quotainfo->qi_bwarnlimit) -#define XFS_QI_RTBWARNLIMIT(mp) ((mp)->m_quotainfo->qi_rtbwarnlimit) -#define XFS_QI_IWARNLIMIT(mp) ((mp)->m_quotainfo->qi_iwarnlimit) -#define XFS_QI_QOFFLOCK(mp) ((mp)->m_quotainfo->qi_quotaofflock) - -#define XFS_QI_MPL_LIST(mp) ((mp)->m_quotainfo->qi_dqlist) -#define XFS_QI_MPLLOCK(mp) ((mp)->m_quotainfo->qi_dqlist.qh_lock) -#define XFS_QI_MPLNEXT(mp) ((mp)->m_quotainfo->qi_dqlist.qh_next) -#define XFS_QI_MPLNDQUOTS(mp) ((mp)->m_quotainfo->qi_dqlist.qh_nelems) - -#define XQMLCK(h) (mutex_lock(&((h)->qh_lock))) -#define XQMUNLCK(h) (mutex_unlock(&((h)->qh_lock))) -#ifdef DEBUG -struct xfs_dqhash; -static inline int XQMISLCKD(struct xfs_dqhash *h) -{ - if (mutex_trylock(&h->qh_lock)) { - mutex_unlock(&h->qh_lock); - return 0; - } - return 1; -} -#endif - -#define XFS_DQ_HASH_LOCK(h) XQMLCK(h) -#define XFS_DQ_HASH_UNLOCK(h) XQMUNLCK(h) -#define XFS_DQ_IS_HASH_LOCKED(h) XQMISLCKD(h) - -#define xfs_qm_mplist_lock(mp) XQMLCK(&(XFS_QI_MPL_LIST(mp))) -#define xfs_qm_mplist_unlock(mp) XQMUNLCK(&(XFS_QI_MPL_LIST(mp))) -#define XFS_QM_IS_MPLIST_LOCKED(mp) XQMISLCKD(&(XFS_QI_MPL_LIST(mp))) - -#define xfs_qm_freelist_lock(qm) XQMLCK(&((qm)->qm_dqfreelist)) -#define xfs_qm_freelist_unlock(qm) XQMUNLCK(&((qm)->qm_dqfreelist)) -#define XFS_QM_IS_FREELIST_LOCKED(qm) XQMISLCKD(&((qm)->qm_dqfreelist)) - -/* - * Hash into a bucket in the dquot hash table, based on <mp, id>. - */ -#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \ - (__psunsigned_t)(id)) & \ - (xfs_Gqm->qm_dqhashmask - 1)) -#define XFS_DQ_HASH(mp, id, type) (type == XFS_DQ_USER ? \ - (xfs_Gqm->qm_usr_dqhtable + \ - XFS_DQ_HASHVAL(mp, id)) : \ - (xfs_Gqm->qm_grp_dqhtable + \ - XFS_DQ_HASHVAL(mp, id))) -#define XFS_IS_DQTYPE_ON(mp, type) (type == XFS_DQ_USER ? \ - XFS_IS_UQUOTA_ON(mp) : \ - XFS_IS_OQUOTA_ON(mp)) -#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ - !dqp->q_core.d_blk_hardlimit && \ - !dqp->q_core.d_blk_softlimit && \ - !dqp->q_core.d_rtb_hardlimit && \ - !dqp->q_core.d_rtb_softlimit && \ - !dqp->q_core.d_ino_hardlimit && \ - !dqp->q_core.d_ino_softlimit && \ - !dqp->q_core.d_bcount && \ - !dqp->q_core.d_rtbcount && \ - !dqp->q_core.d_icount) - -#define HL_PREVP dq_hashlist.ql_prevp -#define HL_NEXT dq_hashlist.ql_next -#define MPL_PREVP dq_mplist.ql_prevp -#define MPL_NEXT dq_mplist.ql_next - - -#define _LIST_REMOVE(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (dqp)->NXT)) \ - (d)->PVP = (dqp)->PVP; \ - *((dqp)->PVP) = d; \ - (dqp)->NXT = NULL; \ - (dqp)->PVP = NULL; \ - (h)->qh_version++; \ - (h)->qh_nelems--; \ - } - -#define _LIST_INSERT(h, dqp, PVP, NXT) \ - { \ - xfs_dquot_t *d; \ - if (((d) = (h)->qh_next)) \ - (d)->PVP = &((dqp)->NXT); \ - (dqp)->NXT = d; \ - (dqp)->PVP = &((h)->qh_next); \ - (h)->qh_next = dqp; \ - (h)->qh_version++; \ - (h)->qh_nelems++; \ - } - -#define FOREACH_DQUOT_IN_MP(dqp, mp) \ - for ((dqp) = XFS_QI_MPLNEXT(mp); (dqp) != NULL; (dqp) = (dqp)->MPL_NEXT) - -#define FOREACH_DQUOT_IN_FREELIST(dqp, qlist) \ -for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \ - (dqp) = (dqp)->dq_flnext) - -#define XQM_HASHLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, HL_PREVP, HL_NEXT) - -#define XQM_FREELIST_INSERT(h, dqp) \ - xfs_qm_freelist_append(h, dqp) - -#define XQM_MPLIST_INSERT(h, dqp) \ - _LIST_INSERT(h, dqp, MPL_PREVP, MPL_NEXT) - -#define XQM_HASHLIST_REMOVE(h, dqp) \ - _LIST_REMOVE(h, dqp, HL_PREVP, HL_NEXT) -#define XQM_FREELIST_REMOVE(dqp) \ - xfs_qm_freelist_unlink(dqp) -#define XQM_MPLIST_REMOVE(h, dqp) \ - { _LIST_REMOVE(h, dqp, MPL_PREVP, MPL_NEXT); \ - XFS_QI_MPLRECLAIMS((dqp)->q_mount)++; } - -#define XFS_DQ_IS_LOGITEM_INITD(dqp) ((dqp)->q_logitem.qli_dquot == (dqp)) - -#define XFS_QM_DQP_TO_DQACCT(tp, dqp) (XFS_QM_ISUDQ(dqp) ? \ - (tp)->t_dqinfo->dqa_usrdquots : \ - (tp)->t_dqinfo->dqa_grpdquots) -#define XFS_IS_SUSER_DQUOT(dqp) \ - (!((dqp)->q_core.d_id)) - -#define XFS_PURGE_INODE(ip) \ - IRELE(ip); - -#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \ - (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \ - (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???"))) -#define DQFLAGTO_DIRTYSTR(d) (XFS_DQ_IS_DIRTY(d) ? "DIRTY" : "NOTDIRTY") - -#endif /* __XFS_QUOTA_PRIV_H__ */ diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c deleted file mode 100644 index 36fbeccdc72..00000000000 --- a/fs/xfs/support/debug.c +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "debug.h" -#include "spin.h" -#include <asm/page.h> -#include <linux/sched.h> -#include <linux/kernel.h> - -static char message[256]; /* keep it off the stack */ -static DEFINE_SPINLOCK(xfs_err_lock); - -/* Translate from CE_FOO to KERN_FOO, err_level(CE_FOO) == KERN_FOO */ -#define XFS_MAX_ERR_LEVEL 7 -#define XFS_ERR_MASK ((1 << 3) - 1) -static const char * const err_level[XFS_MAX_ERR_LEVEL+1] = - {KERN_EMERG, KERN_ALERT, KERN_CRIT, - KERN_ERR, KERN_WARNING, KERN_NOTICE, - KERN_INFO, KERN_DEBUG}; - -void -cmn_err(register int level, char *fmt, ...) -{ - char *fp = fmt; - int len; - ulong flags; - va_list ap; - - level &= XFS_ERR_MASK; - if (level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - va_start(ap, fmt); - if (*fmt == '!') fp++; - len = vsprintf(message, fp, ap); - if (level != CE_DEBUG && message[len-1] != '\n') - strcat(message, "\n"); - printk("%s%s", err_level[level], message); - va_end(ap); - spin_unlock_irqrestore(&xfs_err_lock,flags); - - if (level == CE_PANIC) - BUG(); -} - -void -icmn_err(register int level, char *fmt, va_list ap) -{ - ulong flags; - int len; - - level &= XFS_ERR_MASK; - if(level > XFS_MAX_ERR_LEVEL) - level = XFS_MAX_ERR_LEVEL; - spin_lock_irqsave(&xfs_err_lock,flags); - len = vsprintf(message, fmt, ap); - if (level != CE_DEBUG && message[len-1] != '\n') - strcat(message, "\n"); - spin_unlock_irqrestore(&xfs_err_lock,flags); - printk("%s%s", err_level[level], message); - if (level == CE_PANIC) - BUG(); -} - -void -assfail(char *expr, char *file, int line) -{ - printk("Assertion failed: %s, file: %s, line: %d\n", expr, file, line); - BUG(); -} - -#if ((defined(DEBUG) || defined(INDUCE_IO_ERRROR)) && !defined(NO_WANT_RANDOM)) -unsigned long random(void) -{ - static unsigned long RandomValue = 1; - /* cycles pseudo-randomly through all values between 1 and 2^31 - 2 */ - register long rv = RandomValue; - register long lo; - register long hi; - - hi = rv / 127773; - lo = rv % 127773; - rv = 16807 * lo - 2836 * hi; - if (rv <= 0) rv += 2147483647; - return RandomValue = rv; -} -#endif /* DEBUG || INDUCE_IO_ERRROR || !NO_WANT_RANDOM */ diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h deleted file mode 100644 index 4f54dca662a..00000000000 --- a/fs/xfs/support/debug.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_DEBUG_H__ -#define __XFS_SUPPORT_DEBUG_H__ - -#include <stdarg.h> - -#define CE_DEBUG 7 /* debug */ -#define CE_CONT 6 /* continuation */ -#define CE_NOTE 5 /* notice */ -#define CE_WARN 4 /* warning */ -#define CE_ALERT 1 /* alert */ -#define CE_PANIC 0 /* panic */ - -extern void icmn_err(int, char *, va_list) - __attribute__ ((format (printf, 2, 0))); -extern void cmn_err(int, char *, ...) - __attribute__ ((format (printf, 2, 3))); -extern void assfail(char *expr, char *f, int l); - -#define ASSERT_ALWAYS(expr) \ - (unlikely((expr) != 0) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) - -#ifndef DEBUG -# define ASSERT(expr) ((void)0) -#else -# define ASSERT(expr) ASSERT_ALWAYS(expr) -extern unsigned long random(void); -#endif - -#ifndef STATIC -# define STATIC static -#endif - -#endif /* __XFS_SUPPORT_DEBUG_H__ */ diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c deleted file mode 100644 index addf5a7ea06..00000000000 --- a/fs/xfs/support/ktrace.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -static kmem_zone_t *ktrace_hdr_zone; -static kmem_zone_t *ktrace_ent_zone; -static int ktrace_zentries; - -void -ktrace_init(int zentries) -{ - ktrace_zentries = zentries; - - ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), - "ktrace_hdr"); - ASSERT(ktrace_hdr_zone); - - ktrace_ent_zone = kmem_zone_init(ktrace_zentries - * sizeof(ktrace_entry_t), - "ktrace_ent"); - ASSERT(ktrace_ent_zone); -} - -void -ktrace_uninit(void) -{ - kmem_zone_destroy(ktrace_hdr_zone); - kmem_zone_destroy(ktrace_ent_zone); -} - -/* - * ktrace_alloc() - * - * Allocate a ktrace header and enough buffering for the given - * number of entries. - */ -ktrace_t * -ktrace_alloc(int nentries, unsigned int __nocast sleep) -{ - ktrace_t *ktp; - ktrace_entry_t *ktep; - - ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); - - if (ktp == (ktrace_t*)NULL) { - /* - * KM_SLEEP callers don't expect failure. - */ - if (sleep & KM_SLEEP) - panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); - - return NULL; - } - - /* - * Special treatment for buffers with the ktrace_zentries entries - */ - if (nentries == ktrace_zentries) { - ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, - sleep); - } else { - ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), - sleep); - } - - if (ktep == NULL) { - /* - * KM_SLEEP callers don't expect failure. - */ - if (sleep & KM_SLEEP) - panic("ktrace_alloc: NULL memory on KM_SLEEP request!"); - - kmem_free(ktp, sizeof(*ktp)); - - return NULL; - } - - spinlock_init(&(ktp->kt_lock), "kt_lock"); - - ktp->kt_entries = ktep; - ktp->kt_nentries = nentries; - ktp->kt_index = 0; - ktp->kt_rollover = 0; - return ktp; -} - - -/* - * ktrace_free() - * - * Free up the ktrace header and buffer. It is up to the caller - * to ensure that no-one is referencing it. - */ -void -ktrace_free(ktrace_t *ktp) -{ - int entries_size; - - if (ktp == (ktrace_t *)NULL) - return; - - spinlock_destroy(&ktp->kt_lock); - - /* - * Special treatment for the Vnode trace buffer. - */ - if (ktp->kt_nentries == ktrace_zentries) { - kmem_zone_free(ktrace_ent_zone, ktp->kt_entries); - } else { - entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t)); - - kmem_free(ktp->kt_entries, entries_size); - } - - kmem_zone_free(ktrace_hdr_zone, ktp); -} - - -/* - * Enter the given values into the "next" entry in the trace buffer. - * kt_index is always the index of the next entry to be filled. - */ -void -ktrace_enter( - ktrace_t *ktp, - void *val0, - void *val1, - void *val2, - void *val3, - void *val4, - void *val5, - void *val6, - void *val7, - void *val8, - void *val9, - void *val10, - void *val11, - void *val12, - void *val13, - void *val14, - void *val15) -{ - static DEFINE_SPINLOCK(wrap_lock); - unsigned long flags; - int index; - ktrace_entry_t *ktep; - - ASSERT(ktp != NULL); - - /* - * Grab an entry by pushing the index up to the next one. - */ - spin_lock_irqsave(&wrap_lock, flags); - index = ktp->kt_index; - if (++ktp->kt_index == ktp->kt_nentries) - ktp->kt_index = 0; - spin_unlock_irqrestore(&wrap_lock, flags); - - if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) - ktp->kt_rollover = 1; - - ASSERT((index >= 0) && (index < ktp->kt_nentries)); - - ktep = &(ktp->kt_entries[index]); - - ktep->val[0] = val0; - ktep->val[1] = val1; - ktep->val[2] = val2; - ktep->val[3] = val3; - ktep->val[4] = val4; - ktep->val[5] = val5; - ktep->val[6] = val6; - ktep->val[7] = val7; - ktep->val[8] = val8; - ktep->val[9] = val9; - ktep->val[10] = val10; - ktep->val[11] = val11; - ktep->val[12] = val12; - ktep->val[13] = val13; - ktep->val[14] = val14; - ktep->val[15] = val15; -} - -/* - * Return the number of entries in the trace buffer. - */ -int -ktrace_nentries( - ktrace_t *ktp) -{ - if (ktp == NULL) { - return 0; - } - - return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); -} - -/* - * ktrace_first() - * - * This is used to find the start of the trace buffer. - * In conjunction with ktrace_next() it can be used to - * iterate through the entire trace buffer. This code does - * not do any locking because it is assumed that it is called - * from the debugger. - * - * The caller must pass in a pointer to a ktrace_snap - * structure in which we will keep some state used to - * iterate through the buffer. This state must not touched - * by any code outside of this module. - */ -ktrace_entry_t * -ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp) -{ - ktrace_entry_t *ktep; - int index; - int nentries; - - if (ktp->kt_rollover) - index = ktp->kt_index; - else - index = 0; - - ktsp->ks_start = index; - ktep = &(ktp->kt_entries[index]); - - nentries = ktrace_nentries(ktp); - index++; - if (index < nentries) { - ktsp->ks_index = index; - } else { - ktsp->ks_index = 0; - if (index > nentries) - ktep = NULL; - } - return ktep; -} - -/* - * ktrace_next() - * - * This is used to iterate through the entries of the given - * trace buffer. The caller must pass in the ktrace_snap_t - * structure initialized by ktrace_first(). The return value - * will be either a pointer to the next ktrace_entry or NULL - * if all of the entries have been traversed. - */ -ktrace_entry_t * -ktrace_next( - ktrace_t *ktp, - ktrace_snap_t *ktsp) -{ - int index; - ktrace_entry_t *ktep; - - index = ktsp->ks_index; - if (index == ktsp->ks_start) { - ktep = NULL; - } else { - ktep = &ktp->kt_entries[index]; - } - - index++; - if (index == ktrace_nentries(ktp)) { - ktsp->ks_index = 0; - } else { - ktsp->ks_index = index; - } - - return ktep; -} - -/* - * ktrace_skip() - * - * Skip the next "count" entries and return the entry after that. - * Return NULL if this causes us to iterate past the beginning again. - */ -ktrace_entry_t * -ktrace_skip( - ktrace_t *ktp, - int count, - ktrace_snap_t *ktsp) -{ - int index; - int new_index; - ktrace_entry_t *ktep; - int nentries = ktrace_nentries(ktp); - - index = ktsp->ks_index; - new_index = index + count; - while (new_index >= nentries) { - new_index -= nentries; - } - if (index == ktsp->ks_start) { - /* - * We've iterated around to the start, so we're done. - */ - ktep = NULL; - } else if ((new_index < index) && (index < ktsp->ks_index)) { - /* - * We've skipped past the start again, so we're done. - */ - ktep = NULL; - ktsp->ks_index = ktsp->ks_start; - } else { - ktep = &(ktp->kt_entries[new_index]); - new_index++; - if (new_index == nentries) { - ktsp->ks_index = 0; - } else { - ktsp->ks_index = new_index; - } - } - return ktep; -} diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h deleted file mode 100644 index 0d73216287c..00000000000 --- a/fs/xfs/support/ktrace.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_KTRACE_H__ -#define __XFS_SUPPORT_KTRACE_H__ - -#include <spin.h> - -/* - * Trace buffer entry structure. - */ -typedef struct ktrace_entry { - void *val[16]; -} ktrace_entry_t; - -/* - * Trace buffer header structure. - */ -typedef struct ktrace { - lock_t kt_lock; /* mutex to guard counters */ - int kt_nentries; /* number of entries in trace buf */ - int kt_index; /* current index in entries */ - int kt_rollover; - ktrace_entry_t *kt_entries; /* buffer of entries */ -} ktrace_t; - -/* - * Trace buffer snapshot structure. - */ -typedef struct ktrace_snap { - int ks_start; /* kt_index at time of snap */ - int ks_index; /* current index */ -} ktrace_snap_t; - - -#ifdef CONFIG_XFS_TRACE - -extern void ktrace_init(int zentries); -extern void ktrace_uninit(void); - -extern ktrace_t *ktrace_alloc(int, unsigned int __nocast); -extern void ktrace_free(ktrace_t *); - -extern void ktrace_enter( - ktrace_t *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *, - void *); - -extern ktrace_entry_t *ktrace_first(ktrace_t *, ktrace_snap_t *); -extern int ktrace_nentries(ktrace_t *); -extern ktrace_entry_t *ktrace_next(ktrace_t *, ktrace_snap_t *); -extern ktrace_entry_t *ktrace_skip(ktrace_t *, int, ktrace_snap_t *); - -#else -#define ktrace_init(x) do { } while (0) -#define ktrace_uninit() do { } while (0) -#endif /* CONFIG_XFS_TRACE */ - -#endif /* __XFS_SUPPORT_KTRACE_H__ */ diff --git a/fs/xfs/support/move.c b/fs/xfs/support/move.c deleted file mode 100644 index caefa17b80f..00000000000 --- a/fs/xfs/support/move.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include <xfs.h> - -/* Read from kernel buffer at src to user/kernel buffer defined - * by the uio structure. Advance the pointer in the uio struct - * as we go. - */ -int -uio_read(caddr_t src, size_t len, struct uio *uio) -{ - size_t count; - - if (!len || !uio->uio_resid) - return 0; - - count = uio->uio_iov->iov_len; - if (!count) - return 0; - if (count > len) - count = len; - - if (uio->uio_segflg == UIO_USERSPACE) { - if (copy_to_user(uio->uio_iov->iov_base, src, count)) - return EFAULT; - } else { - ASSERT(uio->uio_segflg == UIO_SYSSPACE); - memcpy(uio->uio_iov->iov_base, src, count); - } - - uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count); - uio->uio_iov->iov_len -= count; - uio->uio_offset += count; - uio->uio_resid -= count; - return 0; -} diff --git a/fs/xfs/support/move.h b/fs/xfs/support/move.h deleted file mode 100644 index 97a2498d2da..00000000000 --- a/fs/xfs/support/move.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Portions Copyright (c) 1982, 1986, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ -#ifndef __XFS_SUPPORT_MOVE_H__ -#define __XFS_SUPPORT_MOVE_H__ - -#include <linux/uio.h> -#include <asm/uaccess.h> - -/* Segment flag values. */ -enum uio_seg { - UIO_USERSPACE, /* from user data space */ - UIO_SYSSPACE, /* from system space */ -}; - -struct uio { - struct iovec *uio_iov; /* pointer to array of iovecs */ - int uio_iovcnt; /* number of iovecs in array */ - xfs_off_t uio_offset; /* offset in file this uio corresponds to */ - int uio_resid; /* residual i/o count */ - enum uio_seg uio_segflg; /* see above */ -}; - -typedef struct uio uio_t; -typedef struct iovec iovec_t; - -extern int uio_read (caddr_t, size_t, uio_t *); - -#endif /* __XFS_SUPPORT_MOVE_H__ */ diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/time.h index 387e695a184..387e695a184 100644 --- a/fs/xfs/linux-2.6/time.h +++ b/fs/xfs/time.h diff --git a/fs/xfs/support/uuid.c b/fs/xfs/uuid.c index e157015c70f..b83f76b6d41 100644 --- a/fs/xfs/support/uuid.c +++ b/fs/xfs/uuid.c @@ -17,10 +17,6 @@ */ #include <xfs.h> -static mutex_t uuid_monitor; -static int uuid_table_size; -static uuid_t *uuid_table; - /* IRIX interpretation of an uuid_t */ typedef struct { __be32 uu_timelow; @@ -46,12 +42,6 @@ uuid_getnodeuniq(uuid_t *uuid, int fsid [2]) fsid[1] = be32_to_cpu(uup->uu_timelow); } -void -uuid_create_nil(uuid_t *uuid) -{ - memset(uuid, 0, sizeof(*uuid)); -} - int uuid_is_nil(uuid_t *uuid) { @@ -71,70 +61,3 @@ uuid_equal(uuid_t *uuid1, uuid_t *uuid2) { return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1; } - -/* - * Given a 128-bit uuid, return a 64-bit value by adding the top and bottom - * 64-bit words. NOTE: This function can not be changed EVER. Although - * brain-dead, some applications depend on this 64-bit value remaining - * persistent. Specifically, DMI vendors store the value as a persistent - * filehandle. - */ -__uint64_t -uuid_hash64(uuid_t *uuid) -{ - __uint64_t *sp = (__uint64_t *)uuid; - - return sp[0] + sp[1]; -} - -int -uuid_table_insert(uuid_t *uuid) -{ - int i, hole; - - mutex_lock(&uuid_monitor); - for (i = 0, hole = -1; i < uuid_table_size; i++) { - if (uuid_is_nil(&uuid_table[i])) { - hole = i; - continue; - } - if (uuid_equal(uuid, &uuid_table[i])) { - mutex_unlock(&uuid_monitor); - return 0; - } - } - if (hole < 0) { - uuid_table = kmem_realloc(uuid_table, - (uuid_table_size + 1) * sizeof(*uuid_table), - uuid_table_size * sizeof(*uuid_table), - KM_SLEEP); - hole = uuid_table_size++; - } - uuid_table[hole] = *uuid; - mutex_unlock(&uuid_monitor); - return 1; -} - -void -uuid_table_remove(uuid_t *uuid) -{ - int i; - - mutex_lock(&uuid_monitor); - for (i = 0; i < uuid_table_size; i++) { - if (uuid_is_nil(&uuid_table[i])) - continue; - if (!uuid_equal(uuid, &uuid_table[i])) - continue; - uuid_create_nil(&uuid_table[i]); - break; - } - ASSERT(i < uuid_table_size); - mutex_unlock(&uuid_monitor); -} - -void -uuid_init(void) -{ - mutex_init(&uuid_monitor); -} diff --git a/fs/xfs/support/uuid.h b/fs/xfs/uuid.h index b6f5922199b..104db0f3bed 100644 --- a/fs/xfs/support/uuid.h +++ b/fs/xfs/uuid.h @@ -22,13 +22,14 @@ typedef struct { unsigned char __u_bits[16]; } uuid_t; -extern void uuid_init(void); -extern void uuid_create_nil(uuid_t *uuid); extern int uuid_is_nil(uuid_t *uuid); extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); -extern __uint64_t uuid_hash64(uuid_t *uuid); -extern int uuid_table_insert(uuid_t *uuid); -extern void uuid_table_remove(uuid_t *uuid); + +static inline void +uuid_copy(uuid_t *dst, uuid_t *src) +{ + memcpy(dst, src, sizeof(uuid_t)); +} #endif /* __XFS_SUPPORT_UUID_H__ */ diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 1a48dbb902a..a742c47f7d5 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -17,5 +17,18 @@ */ #ifndef __XFS_H__ #define __XFS_H__ -#include <linux-2.6/xfs_linux.h> + +#ifdef CONFIG_XFS_DEBUG +#define STATIC +#define DEBUG 1 +#define XFS_BUF_LOCK_TRACKING 1 +#endif + +#ifdef CONFIG_XFS_WARN +#define XFS_WARN 1 +#endif + + +#include "xfs_linux.h" + #endif /* __XFS_H__ */ diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4b0cb474be4..6888ad886ff 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008, Christoph Hellwig * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -16,909 +16,291 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_inum.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_sb.h" +#include "xfs_mount.h" #include "xfs_inode.h" -#include "xfs_btree.h" #include "xfs_acl.h" -#include "xfs_mac.h" #include "xfs_attr.h" - -#include <linux/capability.h> +#include "xfs_trace.h" +#include <linux/slab.h> +#include <linux/xattr.h> #include <linux/posix_acl_xattr.h> -STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *); -STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *); -STATIC void xfs_acl_get_endian(xfs_acl_t *); -STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *); -STATIC int xfs_acl_invalid(xfs_acl_t *); -STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *); -STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *); -STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *); -STATIC int xfs_acl_allow_set(bhv_vnode_t *, int); - -kmem_zone_t *xfs_acl_zone; - - -/* - * Test for existence of access ACL attribute as efficiently as possible. - */ -int -xfs_acl_vhasacl_access( - bhv_vnode_t *vp) -{ - int error; - - xfs_acl_get_attr(vp, NULL, _ACL_TYPE_ACCESS, ATTR_KERNOVAL, &error); - return (error == 0); -} /* - * Test for existence of default ACL attribute as efficiently as possible. + * Locking scheme: + * - all ACL updates are protected by inode->i_mutex, which is taken before + * calling into this file. */ -int -xfs_acl_vhasacl_default( - bhv_vnode_t *vp) -{ - int error; - if (!VN_ISDIR(vp)) - return 0; - xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error); - return (error == 0); -} - -/* - * Convert from extended attribute representation to in-memory for XFS. - */ -STATIC int -posix_acl_xattr_to_xfs( - posix_acl_xattr_header *src, - size_t size, - xfs_acl_t *dest) +STATIC struct posix_acl * +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { - posix_acl_xattr_entry *src_entry; - xfs_acl_entry_t *dest_entry; - int n; - - if (!src || !dest) - return EINVAL; - - if (size < sizeof(posix_acl_xattr_header)) - return EINVAL; + struct posix_acl_entry *acl_e; + struct posix_acl *acl; + struct xfs_acl_entry *ace; + unsigned int count, i; - if (src->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION)) - return EOPNOTSUPP; + count = be32_to_cpu(aclp->acl_cnt); + if (count > max_entries) + return ERR_PTR(-EFSCORRUPTED); - memset(dest, 0, sizeof(xfs_acl_t)); - dest->acl_cnt = posix_acl_xattr_count(size); - if (dest->acl_cnt < 0 || dest->acl_cnt > XFS_ACL_MAX_ENTRIES) - return EINVAL; + acl = posix_acl_alloc(count, GFP_KERNEL); + if (!acl) + return ERR_PTR(-ENOMEM); - /* - * acl_set_file(3) may request that we set default ACLs with - * zero length -- defend (gracefully) against that here. - */ - if (!dest->acl_cnt) - return 0; + for (i = 0; i < count; i++) { + acl_e = &acl->a_entries[i]; + ace = &aclp->acl_entry[i]; - src_entry = (posix_acl_xattr_entry *)((char *)src + sizeof(*src)); - dest_entry = &dest->acl_entry[0]; + /* + * The tag is 32 bits on disk and 16 bits in core. + * + * Because every access to it goes through the core + * format first this is not a problem. + */ + acl_e->e_tag = be32_to_cpu(ace->ae_tag); + acl_e->e_perm = be16_to_cpu(ace->ae_perm); - for (n = 0; n < dest->acl_cnt; n++, src_entry++, dest_entry++) { - dest_entry->ae_perm = le16_to_cpu(src_entry->e_perm); - if (_ACL_PERM_INVALID(dest_entry->ae_perm)) - return EINVAL; - dest_entry->ae_tag = le16_to_cpu(src_entry->e_tag); - switch(dest_entry->ae_tag) { + switch (acl_e->e_tag) { case ACL_USER: + acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id)); + break; case ACL_GROUP: - dest_entry->ae_id = le32_to_cpu(src_entry->e_id); + acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id)); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: - dest_entry->ae_id = ACL_UNDEFINED_ID; break; default: - return EINVAL; + goto fail; } } - if (xfs_acl_invalid(dest)) - return EINVAL; + return acl; - return 0; +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); } -/* - * Comparison function called from xfs_sort(). - * Primary key is ae_tag, secondary key is ae_id. - */ -STATIC int -xfs_acl_entry_compare( - const void *va, - const void *vb) +STATIC void +xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) { - xfs_acl_entry_t *a = (xfs_acl_entry_t *)va, - *b = (xfs_acl_entry_t *)vb; + const struct posix_acl_entry *acl_e; + struct xfs_acl_entry *ace; + int i; - if (a->ae_tag == b->ae_tag) - return (a->ae_id - b->ae_id); - return (a->ae_tag - b->ae_tag); -} + aclp->acl_cnt = cpu_to_be32(acl->a_count); + for (i = 0; i < acl->a_count; i++) { + ace = &aclp->acl_entry[i]; + acl_e = &acl->a_entries[i]; -/* - * Convert from in-memory XFS to extended attribute representation. - */ -STATIC int -posix_acl_xfs_to_xattr( - xfs_acl_t *src, - posix_acl_xattr_header *dest, - size_t size) -{ - int n; - size_t new_size = posix_acl_xattr_size(src->acl_cnt); - posix_acl_xattr_entry *dest_entry; - xfs_acl_entry_t *src_entry; - - if (size < new_size) - return -ERANGE; - - /* Need to sort src XFS ACL by <ae_tag,ae_id> */ - xfs_sort(src->acl_entry, src->acl_cnt, sizeof(src->acl_entry[0]), - xfs_acl_entry_compare); - - dest->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION); - dest_entry = &dest->a_entries[0]; - src_entry = &src->acl_entry[0]; - for (n = 0; n < src->acl_cnt; n++, dest_entry++, src_entry++) { - dest_entry->e_perm = cpu_to_le16(src_entry->ae_perm); - if (_ACL_PERM_INVALID(src_entry->ae_perm)) - return -EINVAL; - dest_entry->e_tag = cpu_to_le16(src_entry->ae_tag); - switch (src_entry->ae_tag) { + ace->ae_tag = cpu_to_be32(acl_e->e_tag); + switch (acl_e->e_tag) { case ACL_USER: + ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid)); + break; case ACL_GROUP: - dest_entry->e_id = cpu_to_le32(src_entry->ae_id); - break; - case ACL_USER_OBJ: - case ACL_GROUP_OBJ: - case ACL_MASK: - case ACL_OTHER: - dest_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID); + ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid)); break; default: - return -EINVAL; - } - } - return new_size; -} - -int -xfs_acl_vget( - bhv_vnode_t *vp, - void *acl, - size_t size, - int kind) -{ - int error; - xfs_acl_t *xfs_acl = NULL; - posix_acl_xattr_header *ext_acl = acl; - int flags = 0; - - VN_HOLD(vp); - if(size) { - if (!(_ACL_ALLOC(xfs_acl))) { - error = ENOMEM; - goto out; + ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID); + break; } - memset(xfs_acl, 0, sizeof(xfs_acl_t)); - } else - flags = ATTR_KERNOVAL; - xfs_acl_get_attr(vp, xfs_acl, kind, flags, &error); - if (error) - goto out; - - if (!size) { - error = -posix_acl_xattr_size(XFS_ACL_MAX_ENTRIES); - } else { - if (xfs_acl_invalid(xfs_acl)) { - error = EINVAL; - goto out; - } - if (kind == _ACL_TYPE_ACCESS) { - bhv_vattr_t va; - - va.va_mask = XFS_AT_MODE; - error = bhv_vop_getattr(vp, &va, 0, sys_cred); - if (error) - goto out; - xfs_acl_sync_mode(va.va_mode, xfs_acl); - } - error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); + ace->ae_perm = cpu_to_be16(acl_e->e_perm); } -out: - VN_RELE(vp); - if(xfs_acl) - _ACL_FREE(xfs_acl); - return -error; } -int -xfs_acl_vremove( - bhv_vnode_t *vp, - int kind) +struct posix_acl * +xfs_get_acl(struct inode *inode, int type) { - int error; - - VN_HOLD(vp); - error = xfs_acl_allow_set(vp, kind); - if (!error) { - error = bhv_vop_attr_remove(vp, kind == _ACL_TYPE_DEFAULT? - SGI_ACL_DEFAULT: SGI_ACL_FILE, - ATTR_ROOT, sys_cred); - if (error == ENOATTR) - error = 0; /* 'scool */ + struct xfs_inode *ip = XFS_I(inode); + struct posix_acl *acl = NULL; + struct xfs_acl *xfs_acl; + unsigned char *ea_name; + int error; + int len; + + trace_xfs_get_acl(ip); + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + ea_name = SGI_ACL_DEFAULT; + break; + default: + BUG(); } - VN_RELE(vp); - return -error; -} -int -xfs_acl_vset( - bhv_vnode_t *vp, - void *acl, - size_t size, - int kind) -{ - posix_acl_xattr_header *ext_acl = acl; - xfs_acl_t *xfs_acl; - int error; - int basicperms = 0; /* more than std unix perms? */ - - if (!acl) - return -EINVAL; - - if (!(_ACL_ALLOC(xfs_acl))) - return -ENOMEM; + /* + * If we have a cached ACLs value just return it, not need to + * go out to the disk. + */ + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + if (!xfs_acl) + return ERR_PTR(-ENOMEM); - error = posix_acl_xattr_to_xfs(ext_acl, size, xfs_acl); + error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl, + &len, ATTR_ROOT); if (error) { - _ACL_FREE(xfs_acl); - return -error; - } - if (!xfs_acl->acl_cnt) { - _ACL_FREE(xfs_acl); - return 0; + /* + * If the attribute doesn't exist make sure we have a negative + * cache entry, for any other error assume it is transient and + * leave the cache entry as ACL_NOT_CACHED. + */ + if (error == -ENOATTR) + goto out_update_cache; + goto out; } - VN_HOLD(vp); - error = xfs_acl_allow_set(vp, kind); - if (error) + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + if (IS_ERR(acl)) goto out; - /* Incoming ACL exists, set file mode based on its value */ - if (kind == _ACL_TYPE_ACCESS) - xfs_acl_setmode(vp, xfs_acl, &basicperms); - - /* - * If we have more than std unix permissions, set up the actual attr. - * Otherwise, delete any existing attr. This prevents us from - * having actual attrs for permissions that can be stored in the - * standard permission bits. - */ - if (!basicperms) { - xfs_acl_set_attr(vp, xfs_acl, kind, &error); - } else { - xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); - } - +out_update_cache: + set_cached_acl(inode, type, acl); out: - VN_RELE(vp); - _ACL_FREE(xfs_acl); - return -error; + kmem_free(xfs_acl); + return acl; } -int -xfs_acl_iaccess( - xfs_inode_t *ip, - mode_t mode, - cred_t *cr) +STATIC int +__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) { - xfs_acl_t *acl; - int rval; - - if (!(_ACL_ALLOC(acl))) - return -1; - - /* If the file has no ACL return -1. */ - rval = sizeof(xfs_acl_t); - if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE, - (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) { - _ACL_FREE(acl); - return -1; - } - xfs_acl_get_endian(acl); + struct xfs_inode *ip = XFS_I(inode); + unsigned char *ea_name; + int error; - /* If the file has an empty ACL return -1. */ - if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) { - _ACL_FREE(acl); - return -1; + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = SGI_ACL_FILE; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + ea_name = SGI_ACL_DEFAULT; + break; + default: + return -EINVAL; } - /* Synchronize ACL with mode bits */ - xfs_acl_sync_mode(ip->i_d.di_mode, acl); + if (acl) { + struct xfs_acl *xfs_acl; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - rval = xfs_acl_access(ip->i_d.di_uid, ip->i_d.di_gid, acl, mode, cr); - _ACL_FREE(acl); - return rval; -} + xfs_acl = kmem_zalloc_large(len, KM_SLEEP); + if (!xfs_acl) + return -ENOMEM; -STATIC int -xfs_acl_allow_set( - bhv_vnode_t *vp, - int kind) -{ - bhv_vattr_t va; - int error; - - if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND)) - return EPERM; - if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp)) - return ENOTDIR; - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) - return EROFS; - va.va_mask = XFS_AT_UID; - error = bhv_vop_getattr(vp, &va, 0, NULL); - if (error) - return error; - if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) - return EPERM; - return error; -} - -/* - * The access control process to determine the access permission: - * if uid == file owner id, use the file owner bits. - * if gid == file owner group id, use the file group bits. - * scan ACL for a matching user or group, and use matched entry - * permission. Use total permissions of all matching group entries, - * until all acl entries are exhausted. The final permission produced - * by matching acl entry or entries needs to be & with group permission. - * if not owner, owning group, or matching entry in ACL, use file - * other bits. - */ -STATIC int -xfs_acl_capability_check( - mode_t mode, - cred_t *cr) -{ - if ((mode & ACL_READ) && !capable_cred(cr, CAP_DAC_READ_SEARCH)) - return EACCES; - if ((mode & ACL_WRITE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) - return EACCES; - if ((mode & ACL_EXECUTE) && !capable_cred(cr, CAP_DAC_OVERRIDE)) - return EACCES; - - return 0; -} + xfs_acl_to_disk(xfs_acl, acl); -/* - * Note: cr is only used here for the capability check if the ACL test fails. - * It is not used to find out the credentials uid or groups etc, as was - * done in IRIX. It is assumed that the uid and groups for the current - * thread are taken from "current" instead of the cr parameter. - */ -STATIC int -xfs_acl_access( - uid_t fuid, - gid_t fgid, - xfs_acl_t *fap, - mode_t md, - cred_t *cr) -{ - xfs_acl_entry_t matched; - int i, allows; - int maskallows = -1; /* true, but not 1, either */ - int seen_userobj = 0; + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); - matched.ae_tag = 0; /* Invalid type */ - matched.ae_perm = 0; - md >>= 6; /* Normalize the bits for comparison */ + error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, + len, ATTR_ROOT); - for (i = 0; i < fap->acl_cnt; i++) { + kmem_free(xfs_acl); + } else { /* - * Break out if we've got a user_obj entry or - * a user entry and the mask (and have processed USER_OBJ) + * A NULL ACL argument means we want to remove the ACL. */ - if (matched.ae_tag == ACL_USER_OBJ) - break; - if (matched.ae_tag == ACL_USER) { - if (maskallows != -1 && seen_userobj) - break; - if (fap->acl_entry[i].ae_tag != ACL_MASK && - fap->acl_entry[i].ae_tag != ACL_USER_OBJ) - continue; - } - /* True if this entry allows the requested access */ - allows = ((fap->acl_entry[i].ae_perm & md) == md); + error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT); - switch (fap->acl_entry[i].ae_tag) { - case ACL_USER_OBJ: - seen_userobj = 1; - if (fuid != current->fsuid) - continue; - matched.ae_tag = ACL_USER_OBJ; - matched.ae_perm = allows; - break; - case ACL_USER: - if (fap->acl_entry[i].ae_id != current->fsuid) - continue; - matched.ae_tag = ACL_USER; - matched.ae_perm = allows; - break; - case ACL_GROUP_OBJ: - if ((matched.ae_tag == ACL_GROUP_OBJ || - matched.ae_tag == ACL_GROUP) && !allows) - continue; - if (!in_group_p(fgid)) - continue; - matched.ae_tag = ACL_GROUP_OBJ; - matched.ae_perm = allows; - break; - case ACL_GROUP: - if ((matched.ae_tag == ACL_GROUP_OBJ || - matched.ae_tag == ACL_GROUP) && !allows) - continue; - if (!in_group_p(fap->acl_entry[i].ae_id)) - continue; - matched.ae_tag = ACL_GROUP; - matched.ae_perm = allows; - break; - case ACL_MASK: - maskallows = allows; - break; - case ACL_OTHER: - if (matched.ae_tag != 0) - continue; - matched.ae_tag = ACL_OTHER; - matched.ae_perm = allows; - break; - } - } - /* - * First possibility is that no matched entry allows access. - * The capability to override DAC may exist, so check for it. - */ - switch (matched.ae_tag) { - case ACL_OTHER: - case ACL_USER_OBJ: - if (matched.ae_perm) - return 0; - break; - case ACL_USER: - case ACL_GROUP_OBJ: - case ACL_GROUP: - if (maskallows && matched.ae_perm) - return 0; - break; - case 0: - break; + /* + * If the attribute didn't exist to start with that's fine. + */ + if (error == -ENOATTR) + error = 0; } - return xfs_acl_capability_check(md, cr); + if (!error) + set_cached_acl(inode, type, acl); + return error; } -/* - * ACL validity checker. - * This acl validation routine checks each ACL entry read in makes sense. - */ -STATIC int -xfs_acl_invalid( - xfs_acl_t *aclp) +static int +xfs_set_mode(struct inode *inode, umode_t mode) { - xfs_acl_entry_t *entry, *e; - int user = 0, group = 0, other = 0, mask = 0; - int mask_required = 0; - int i, j; + int error = 0; - if (!aclp) - goto acl_invalid; + if (mode != inode->i_mode) { + struct iattr iattr; - if (aclp->acl_cnt > XFS_ACL_MAX_ENTRIES) - goto acl_invalid; + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; + iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); - for (i = 0; i < aclp->acl_cnt; i++) { - entry = &aclp->acl_entry[i]; - switch (entry->ae_tag) { - case ACL_USER_OBJ: - if (user++) - goto acl_invalid; - break; - case ACL_GROUP_OBJ: - if (group++) - goto acl_invalid; - break; - case ACL_OTHER: - if (other++) - goto acl_invalid; - break; - case ACL_USER: - case ACL_GROUP: - for (j = i + 1; j < aclp->acl_cnt; j++) { - e = &aclp->acl_entry[j]; - if (e->ae_id == entry->ae_id && - e->ae_tag == entry->ae_tag) - goto acl_invalid; - } - mask_required++; - break; - case ACL_MASK: - if (mask++) - goto acl_invalid; - break; - default: - goto acl_invalid; - } + error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } - if (!user || !group || !other || (mask_required && !mask)) - goto acl_invalid; - else - return 0; -acl_invalid: - return EINVAL; -} - -/* - * Do ACL endian conversion. - */ -STATIC void -xfs_acl_get_endian( - xfs_acl_t *aclp) -{ - xfs_acl_entry_t *ace, *end; - - INT_SET(aclp->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); - end = &aclp->acl_entry[0]+aclp->acl_cnt; - for (ace = &aclp->acl_entry[0]; ace < end; ace++) { - INT_SET(ace->ae_tag, ARCH_CONVERT, ace->ae_tag); - INT_SET(ace->ae_id, ARCH_CONVERT, ace->ae_id); - INT_SET(ace->ae_perm, ARCH_CONVERT, ace->ae_perm); - } -} -/* - * Get the ACL from the EA and do endian conversion. - */ -STATIC void -xfs_acl_get_attr( - bhv_vnode_t *vp, - xfs_acl_t *aclp, - int kind, - int flags, - int *error) -{ - int len = sizeof(xfs_acl_t); - - ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1); - flags |= ATTR_ROOT; - *error = bhv_vop_attr_get(vp, kind == _ACL_TYPE_ACCESS ? - SGI_ACL_FILE : SGI_ACL_DEFAULT, - (char *)aclp, &len, flags, sys_cred); - if (*error || (flags & ATTR_KERNOVAL)) - return; - xfs_acl_get_endian(aclp); + return error; } -/* - * Set the EA with the ACL and do endian conversion. - */ -STATIC void -xfs_acl_set_attr( - bhv_vnode_t *vp, - xfs_acl_t *aclp, - int kind, - int *error) +static int +xfs_acl_exists(struct inode *inode, unsigned char *name) { - xfs_acl_entry_t *ace, *newace, *end; - xfs_acl_t *newacl; - int len; - - if (!(_ACL_ALLOC(newacl))) { - *error = ENOMEM; - return; - } + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); - len = sizeof(xfs_acl_t) - - (sizeof(xfs_acl_entry_t) * (XFS_ACL_MAX_ENTRIES - aclp->acl_cnt)); - end = &aclp->acl_entry[0]+aclp->acl_cnt; - for (ace = &aclp->acl_entry[0], newace = &newacl->acl_entry[0]; - ace < end; - ace++, newace++) { - INT_SET(newace->ae_tag, ARCH_CONVERT, ace->ae_tag); - INT_SET(newace->ae_id, ARCH_CONVERT, ace->ae_id); - INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm); - } - INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt); - *error = bhv_vop_attr_set(vp, kind == _ACL_TYPE_ACCESS ? - SGI_ACL_FILE: SGI_ACL_DEFAULT, - (char *)newacl, len, ATTR_ROOT, sys_cred); - _ACL_FREE(newacl); + return (xfs_attr_get(XFS_I(inode), name, NULL, &len, + ATTR_ROOT|ATTR_KERNOVAL) == 0); } int -xfs_acl_vtoacl( - bhv_vnode_t *vp, - xfs_acl_t *access_acl, - xfs_acl_t *default_acl) +posix_acl_access_exists(struct inode *inode) { - bhv_vattr_t va; - int error = 0; - - if (access_acl) { - /* - * Get the Access ACL and the mode. If either cannot - * be obtained for some reason, invalidate the access ACL. - */ - xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error); - if (!error) { - /* Got the ACL, need the mode... */ - va.va_mask = XFS_AT_MODE; - error = bhv_vop_getattr(vp, &va, 0, sys_cred); - } - - if (error) - access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; - else /* We have a good ACL and the file mode, synchronize. */ - xfs_acl_sync_mode(va.va_mode, access_acl); - } - - if (default_acl) { - xfs_acl_get_attr(vp, default_acl, _ACL_TYPE_DEFAULT, 0, &error); - if (error) - default_acl->acl_cnt = XFS_ACL_NOT_PRESENT; - } - return error; + return xfs_acl_exists(inode, SGI_ACL_FILE); } -/* - * This function retrieves the parent directory's acl, processes it - * and lets the child inherit the acl(s) that it should. - */ int -xfs_acl_inherit( - bhv_vnode_t *vp, - bhv_vattr_t *vap, - xfs_acl_t *pdaclp) +posix_acl_default_exists(struct inode *inode) { - xfs_acl_t *cacl; - int error = 0; - int basicperms = 0; - - /* - * If the parent does not have a default ACL, or it's an - * invalid ACL, we're done. - */ - if (!vp) + if (!S_ISDIR(inode->i_mode)) return 0; - if (!pdaclp || xfs_acl_invalid(pdaclp)) - return 0; - - /* - * Copy the default ACL of the containing directory to - * the access ACL of the new file and use the mode that - * was passed in to set up the correct initial values for - * the u::,g::[m::], and o:: entries. This is what makes - * umask() "work" with ACL's. - */ - - if (!(_ACL_ALLOC(cacl))) - return ENOMEM; - - memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); - xfs_acl_filter_mode(vap->va_mode, cacl); - xfs_acl_setmode(vp, cacl, &basicperms); - - /* - * Set the Default and Access ACL on the file. The mode is already - * set on the file, so we don't need to worry about that. - * - * If the new file is a directory, its default ACL is a copy of - * the containing directory's default ACL. - */ - if (VN_ISDIR(vp)) - xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); - if (!error && !basicperms) - xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); - _ACL_FREE(cacl); - return error; + return xfs_acl_exists(inode, SGI_ACL_DEFAULT); } -/* - * Set up the correct mode on the file based on the supplied ACL. This - * makes sure that the mode on the file reflects the state of the - * u::,g::[m::], and o:: entries in the ACL. Since the mode is where - * the ACL is going to get the permissions for these entries, we must - * synchronize the mode whenever we set the ACL on a file. - */ -STATIC int -xfs_acl_setmode( - bhv_vnode_t *vp, - xfs_acl_t *acl, - int *basicperms) +int +xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - bhv_vattr_t va; - xfs_acl_entry_t *ap; - xfs_acl_entry_t *gap = NULL; - int i, error, nomask = 1; - - *basicperms = 1; + int error = 0; - if (acl->acl_cnt == XFS_ACL_NOT_PRESENT) - return 0; + if (!acl) + goto set_acl; - /* - * Copy the u::, g::, o::, and m:: bits from the ACL into the - * mode. The m:: bits take precedence over the g:: bits. - */ - va.va_mask = XFS_AT_MODE; - error = bhv_vop_getattr(vp, &va, 0, sys_cred); - if (error) + error = -E2BIG; + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) return error; - va.va_mask = XFS_AT_MODE; - va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); - ap = acl->acl_entry; - for (i = 0; i < acl->acl_cnt; ++i) { - switch (ap->ae_tag) { - case ACL_USER_OBJ: - va.va_mode |= ap->ae_perm << 6; - break; - case ACL_GROUP_OBJ: - gap = ap; - break; - case ACL_MASK: /* more than just standard modes */ - nomask = 0; - va.va_mode |= ap->ae_perm << 3; - *basicperms = 0; - break; - case ACL_OTHER: - va.va_mode |= ap->ae_perm; - break; - default: /* more than just standard modes */ - *basicperms = 0; - break; - } - ap++; - } + if (type == ACL_TYPE_ACCESS) { + umode_t mode = inode->i_mode; + error = posix_acl_equiv_mode(acl, &mode); - /* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */ - if (gap && nomask) - va.va_mode |= gap->ae_perm << 3; - - return bhv_vop_setattr(vp, &va, 0, sys_cred); -} + if (error <= 0) { + acl = NULL; -/* - * The permissions for the special ACL entries (u::, g::[m::], o::) are - * actually stored in the file mode (if there is both a group and a mask, - * the group is stored in the ACL entry and the mask is stored on the file). - * This allows the mode to remain automatically in sync with the ACL without - * the need for a call-back to the ACL system at every point where the mode - * could change. This function takes the permissions from the specified mode - * and places it in the supplied ACL. - * - * This implementation draws its validity from the fact that, when the ACL - * was assigned, the mode was copied from the ACL. - * If the mode did not change, therefore, the mode remains exactly what was - * taken from the special ACL entries at assignment. - * If a subsequent chmod() was done, the POSIX spec says that the change in - * mode must cause an update to the ACL seen at user level and used for - * access checks. Before and after a mode change, therefore, the file mode - * most accurately reflects what the special ACL entries should permit/deny. - * - * CAVEAT: If someone sets the SGI_ACL_FILE attribute directly, - * the existing mode bits will override whatever is in the - * ACL. Similarly, if there is a pre-existing ACL that was - * never in sync with its mode (owing to a bug in 6.5 and - * before), it will now magically (or mystically) be - * synchronized. This could cause slight astonishment, but - * it is better than inconsistent permissions. - * - * The supplied ACL is a template that may contain any combination - * of special entries. These are treated as place holders when we fill - * out the ACL. This routine does not add or remove special entries, it - * simply unites each special entry with its associated set of permissions. - */ -STATIC void -xfs_acl_sync_mode( - mode_t mode, - xfs_acl_t *acl) -{ - int i, nomask = 1; - xfs_acl_entry_t *ap; - xfs_acl_entry_t *gap = NULL; - - /* - * Set ACL entries. POSIX1003.1eD16 requires that the MASK - * be set instead of the GROUP entry, if there is a MASK. - */ - for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) { - switch (ap->ae_tag) { - case ACL_USER_OBJ: - ap->ae_perm = (mode >> 6) & 0x7; - break; - case ACL_GROUP_OBJ: - gap = ap; - break; - case ACL_MASK: - nomask = 0; - ap->ae_perm = (mode >> 3) & 0x7; - break; - case ACL_OTHER: - ap->ae_perm = mode & 0x7; - break; - default: - break; + if (error < 0) + return error; } - } - /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */ - if (gap && nomask) - gap->ae_perm = (mode >> 3) & 0x7; -} - -/* - * When inheriting an Access ACL from a directory Default ACL, - * the ACL bits are set to the intersection of the ACL default - * permission bits and the file permission bits in mode. If there - * are no permission bits on the file then we must not give them - * the ACL. This is what what makes umask() work with ACLs. - */ -STATIC void -xfs_acl_filter_mode( - mode_t mode, - xfs_acl_t *acl) -{ - int i, nomask = 1; - xfs_acl_entry_t *ap; - xfs_acl_entry_t *gap = NULL; - /* - * Set ACL entries. POSIX1003.1eD16 requires that the MASK - * be merged with GROUP entry, if there is a MASK. - */ - for (ap = acl->acl_entry, i = 0; i < acl->acl_cnt; ap++, i++) { - switch (ap->ae_tag) { - case ACL_USER_OBJ: - ap->ae_perm &= (mode >> 6) & 0x7; - break; - case ACL_GROUP_OBJ: - gap = ap; - break; - case ACL_MASK: - nomask = 0; - ap->ae_perm &= (mode >> 3) & 0x7; - break; - case ACL_OTHER: - ap->ae_perm &= mode & 0x7; - break; - default: - break; - } + error = xfs_set_mode(inode, mode); + if (error) + return error; } - /* Set the ACL_GROUP_OBJ if there's no ACL_MASK */ - if (gap && nomask) - gap->ae_perm &= (mode >> 3) & 0x7; + + set_acl: + return __xfs_set_acl(inode, type, acl); } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index f853cf1a627..5dc16374451 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -18,85 +18,58 @@ #ifndef __XFS_ACL_H__ #define __XFS_ACL_H__ -/* - * Access Control Lists - */ -typedef __uint16_t xfs_acl_perm_t; -typedef __int32_t xfs_acl_type_t; -typedef __int32_t xfs_acl_tag_t; -typedef __int32_t xfs_acl_id_t; +struct inode; +struct posix_acl; +struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) -typedef struct xfs_acl_entry { - xfs_acl_tag_t ae_tag; - xfs_acl_id_t ae_id; - xfs_acl_perm_t ae_perm; -} xfs_acl_entry_t; +/* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; -typedef struct xfs_acl { - __int32_t acl_cnt; - xfs_acl_entry_t acl_entry[XFS_ACL_MAX_ENTRIES]; -} xfs_acl_t; +struct xfs_acl { + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; +}; + +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE "SGI_ACL_FILE" -#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" +#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" +#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) - #ifdef CONFIG_XFS_POSIX_ACL - -struct vattr; -struct bhv_vnode; -struct xfs_inode; - -extern struct kmem_zone *xfs_acl_zone; -#define xfs_acl_zone_init(zone, name) \ - (zone) = kmem_zone_init(sizeof(xfs_acl_t), (name)) -#define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone) - -extern int xfs_acl_inherit(struct bhv_vnode *, struct bhv_vattr *, xfs_acl_t *); -extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *); -extern int xfs_acl_vtoacl(struct bhv_vnode *, xfs_acl_t *, xfs_acl_t *); -extern int xfs_acl_vhasacl_access(struct bhv_vnode *); -extern int xfs_acl_vhasacl_default(struct bhv_vnode *); -extern int xfs_acl_vset(struct bhv_vnode *, void *, size_t, int); -extern int xfs_acl_vget(struct bhv_vnode *, void *, size_t, int); -extern int xfs_acl_vremove(struct bhv_vnode *, int); - -#define _ACL_TYPE_ACCESS 1 -#define _ACL_TYPE_DEFAULT 2 -#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE)) - -#define _ACL_INHERIT(c,v,d) (xfs_acl_inherit(c,v,d)) -#define _ACL_GET_ACCESS(pv,pa) (xfs_acl_vtoacl(pv,pa,NULL) == 0) -#define _ACL_GET_DEFAULT(pv,pd) (xfs_acl_vtoacl(pv,NULL,pd) == 0) -#define _ACL_ACCESS_EXISTS xfs_acl_vhasacl_access -#define _ACL_DEFAULT_EXISTS xfs_acl_vhasacl_default -#define _ACL_XFS_IACCESS(i,m,c) (XFS_IFORK_Q(i) ? xfs_acl_iaccess(i,m,c) : -1) - -#define _ACL_ALLOC(a) ((a) = kmem_zone_alloc(xfs_acl_zone, KM_SLEEP)) -#define _ACL_FREE(a) ((a)? kmem_zone_free(xfs_acl_zone, (a)):(void)0) - +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int posix_acl_access_exists(struct inode *inode); +extern int posix_acl_default_exists(struct inode *inode); #else -#define xfs_acl_zone_init(zone,name) -#define xfs_acl_zone_destroy(zone) -#define xfs_acl_vset(v,p,sz,t) (-EOPNOTSUPP) -#define xfs_acl_vget(v,p,sz,t) (-EOPNOTSUPP) -#define xfs_acl_vremove(v,t) (-EOPNOTSUPP) -#define xfs_acl_vhasacl_access(v) (0) -#define xfs_acl_vhasacl_default(v) (0) -#define _ACL_ALLOC(a) (1) /* successfully allocate nothing */ -#define _ACL_FREE(a) ((void)0) -#define _ACL_INHERIT(c,v,d) (0) -#define _ACL_GET_ACCESS(pv,pa) (0) -#define _ACL_GET_DEFAULT(pv,pd) (0) -#define _ACL_ACCESS_EXISTS (NULL) -#define _ACL_DEFAULT_EXISTS (NULL) -#define _ACL_XFS_IACCESS(i,m,c) (-1) -#endif - +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +{ + return NULL; +} +# define xfs_set_acl NULL +# define posix_acl_access_exists(inode) 0 +# define posix_acl_default_exists(inode) 0 +#endif /* CONFIG_XFS_POSIX_ACL */ #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index dc2361dd740..6e247a99f5d 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -30,6 +30,7 @@ struct xfs_trans; #define XFS_AGF_MAGIC 0x58414746 /* 'XAGF' */ #define XFS_AGI_MAGIC 0x58414749 /* 'XAGI' */ +#define XFS_AGFL_MAGIC 0x5841464c /* 'XAFL' */ #define XFS_AGF_VERSION 1 #define XFS_AGI_VERSION 1 @@ -63,13 +64,33 @@ typedef struct xfs_agf { __be32 agf_spare0; /* spare field */ __be32 agf_levels[XFS_BTNUM_AGF]; /* btree levels */ __be32 agf_spare1; /* spare field */ + __be32 agf_flfirst; /* first freelist block's index */ __be32 agf_fllast; /* last freelist block's index */ __be32 agf_flcount; /* count of blocks in freelist */ __be32 agf_freeblks; /* total free blocks */ + __be32 agf_longest; /* longest free space */ + __be32 agf_btreeblks; /* # of blocks held in AGF btrees */ + uuid_t agf_uuid; /* uuid of filesystem */ + + /* + * reserve some contiguous space for future logged fields before we add + * the unlogged fields. This makes the range logging via flags and + * structure offsets much simpler. + */ + __be64 agf_spare64[16]; + + /* unlogged fields, written during buffer writeback. */ + __be64 agf_lsn; /* last write sequence */ + __be32 agf_crc; /* crc of agf sector */ + __be32 agf_spare2; + + /* structure must be padded to 64 bit alignment */ } xfs_agf_t; +#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc) + #define XFS_AGF_MAGICNUM 0x00000001 #define XFS_AGF_VERSIONNUM 0x00000002 #define XFS_AGF_SEQNO 0x00000004 @@ -81,14 +102,33 @@ typedef struct xfs_agf { #define XFS_AGF_FLCOUNT 0x00000100 #define XFS_AGF_FREEBLKS 0x00000200 #define XFS_AGF_LONGEST 0x00000400 -#define XFS_AGF_NUM_BITS 11 +#define XFS_AGF_BTREEBLKS 0x00000800 +#define XFS_AGF_UUID 0x00001000 +#define XFS_AGF_NUM_BITS 13 #define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1) +#define XFS_AGF_FLAGS \ + { XFS_AGF_MAGICNUM, "MAGICNUM" }, \ + { XFS_AGF_VERSIONNUM, "VERSIONNUM" }, \ + { XFS_AGF_SEQNO, "SEQNO" }, \ + { XFS_AGF_LENGTH, "LENGTH" }, \ + { XFS_AGF_ROOTS, "ROOTS" }, \ + { XFS_AGF_LEVELS, "LEVELS" }, \ + { XFS_AGF_FLFIRST, "FLFIRST" }, \ + { XFS_AGF_FLLAST, "FLLAST" }, \ + { XFS_AGF_FLCOUNT, "FLCOUNT" }, \ + { XFS_AGF_FREEBLKS, "FREEBLKS" }, \ + { XFS_AGF_LONGEST, "LONGEST" }, \ + { XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \ + { XFS_AGF_UUID, "UUID" } + /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log)) #define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp)) -#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)((bp)->b_addr)) +extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); /* * Size of the unlinked inode hash table in the agi. @@ -112,6 +152,7 @@ typedef struct xfs_agi { __be32 agi_root; /* root of inode btree */ __be32 agi_level; /* levels in inode btree */ __be32 agi_freecount; /* number of free inodes */ + __be32 agi_newino; /* new inode just allocated */ __be32 agi_dirino; /* last directory inode chunk */ /* @@ -119,26 +160,46 @@ typedef struct xfs_agi { * still being referenced. */ __be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS]; + /* + * This marks the end of logging region 1 and start of logging region 2. + */ + uuid_t agi_uuid; /* uuid of filesystem */ + __be32 agi_crc; /* crc of agi sector */ + __be32 agi_pad32; + __be64 agi_lsn; /* last write sequence */ + + __be32 agi_free_root; /* root of the free inode btree */ + __be32 agi_free_level;/* levels in free inode btree */ + + /* structure must be padded to 64 bit alignment */ } xfs_agi_t; -#define XFS_AGI_MAGICNUM 0x00000001 -#define XFS_AGI_VERSIONNUM 0x00000002 -#define XFS_AGI_SEQNO 0x00000004 -#define XFS_AGI_LENGTH 0x00000008 -#define XFS_AGI_COUNT 0x00000010 -#define XFS_AGI_ROOT 0x00000020 -#define XFS_AGI_LEVEL 0x00000040 -#define XFS_AGI_FREECOUNT 0x00000080 -#define XFS_AGI_NEWINO 0x00000100 -#define XFS_AGI_DIRINO 0x00000200 -#define XFS_AGI_UNLINKED 0x00000400 -#define XFS_AGI_NUM_BITS 11 -#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1) +#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc) + +#define XFS_AGI_MAGICNUM (1 << 0) +#define XFS_AGI_VERSIONNUM (1 << 1) +#define XFS_AGI_SEQNO (1 << 2) +#define XFS_AGI_LENGTH (1 << 3) +#define XFS_AGI_COUNT (1 << 4) +#define XFS_AGI_ROOT (1 << 5) +#define XFS_AGI_LEVEL (1 << 6) +#define XFS_AGI_FREECOUNT (1 << 7) +#define XFS_AGI_NEWINO (1 << 8) +#define XFS_AGI_DIRINO (1 << 9) +#define XFS_AGI_UNLINKED (1 << 10) +#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */ +#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1) +#define XFS_AGI_FREE_ROOT (1 << 11) +#define XFS_AGI_FREE_LEVEL (1 << 12) +#define XFS_AGI_NUM_BITS_R2 13 /* disk block (xfs_daddr_t) in the AG */ #define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log)) #define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp)) -#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)((bp)->b_addr)) + +extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_agnumber_t agno, struct xfs_buf **bpp); /* * The third a.g. block contains the a.g. freelist, an array @@ -146,53 +207,42 @@ typedef struct xfs_agi { */ #define XFS_AGFL_DADDR(mp) ((xfs_daddr_t)(3 << (mp)->m_sectbb_log)) #define XFS_AGFL_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGFL_DADDR(mp)) -#define XFS_AGFL_SIZE(mp) ((mp)->m_sb.sb_sectsize / sizeof(xfs_agblock_t)) -#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_AGFL(bp) ((xfs_agfl_t *)((bp)->b_addr)) -typedef struct xfs_agfl { - xfs_agblock_t agfl_bno[1]; /* actually XFS_AGFL_SIZE(mp) */ -} xfs_agfl_t; +#define XFS_BUF_TO_AGFL_BNO(mp, bp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + &(XFS_BUF_TO_AGFL(bp)->agfl_bno[0]) : \ + (__be32 *)(bp)->b_addr) /* - * Busy block/extent entry. Used in perag to mark blocks that have been freed - * but whose transactions aren't committed to disk yet. + * Size of the AGFL. For CRC-enabled filesystes we steal a couple of + * slots in the beginning of the block for a proper header with the + * location information and CRC. */ -typedef struct xfs_perag_busy { - xfs_agblock_t busy_start; - xfs_extlen_t busy_length; - struct xfs_trans *busy_tp; /* transaction that did the free */ -} xfs_perag_busy_t; +#define XFS_AGFL_SIZE(mp) \ + (((mp)->m_sb.sb_sectsize - \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + sizeof(struct xfs_agfl) : 0)) / \ + sizeof(xfs_agblock_t)) + +typedef struct xfs_agfl { + __be32 agfl_magicnum; + __be32 agfl_seqno; + uuid_t agfl_uuid; + __be64 agfl_lsn; + __be32 agfl_crc; + __be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */ +} xfs_agfl_t; + +#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) /* - * Per-ag incore structure, copies of information in agf and agi, - * to improve the performance of allocation group selection. - * - * pick sizes which fit in allocation buckets well + * tags for inode radix tree */ -#if (BITS_PER_LONG == 32) -#define XFS_PAGB_NUM_SLOTS 84 -#elif (BITS_PER_LONG == 64) -#define XFS_PAGB_NUM_SLOTS 128 -#endif - -typedef struct xfs_perag -{ - char pagf_init; /* this agf's entry is initialized */ - char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is preferred to be metadata */ - char pagi_inodeok; /* The agi is ok for inodes */ - __uint8_t pagf_levels[XFS_BTNUM_AGF]; - /* # of levels in bno & cnt btree */ - __uint32_t pagf_flcount; /* count of blocks in freelist */ - xfs_extlen_t pagf_freeblks; /* total free blocks */ - xfs_extlen_t pagf_longest; /* longest free space */ - xfs_agino_t pagi_freecount; /* number of free inodes */ -#ifdef __KERNEL__ - lock_t pagb_lock; /* lock for pagb_list */ -#endif - int pagb_count; /* pagb slots in use */ - xfs_perag_busy_t *pagb_list; /* unstable blocks */ -} xfs_perag_t; +#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup + in xfs_inode_ag_iterator */ +#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ +#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ @@ -203,15 +253,15 @@ typedef struct xfs_perag be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) #define XFS_MIN_FREELIST_PAG(pag,mp) \ (XFS_MIN_FREELIST_RAW( \ - (uint_t)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (uint_t)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ + (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ ((xfs_agnumber_t)((fsbno) >> (mp)->m_sb.sb_agblklog)) #define XFS_FSB_TO_AGBNO(mp,fsbno) \ - ((xfs_agblock_t)((fsbno) & XFS_MASK32LO((mp)->m_sb.sb_agblklog))) + ((xfs_agblock_t)((fsbno) & xfs_mask32lo((mp)->m_sb.sb_agblklog))) #define XFS_AGB_TO_DADDR(mp,agno,agbno) \ ((xfs_daddr_t)XFS_FSB_TO_BB(mp, \ (xfs_fsblock_t)(agno) * (mp)->m_sb.sb_agblocks + (agbno))) @@ -224,8 +274,8 @@ typedef struct xfs_perag #define XFS_AG_CHECK_DADDR(mp,d,len) \ ((len) == 1 ? \ ASSERT((d) == XFS_SB_DADDR || \ - XFS_DADDR_TO_AGBNO(mp, d) != XFS_SB_DADDR) : \ - ASSERT(XFS_DADDR_TO_AGNO(mp, d) == \ - XFS_DADDR_TO_AGNO(mp, (d) + (len) - 1))) + xfs_daddr_to_agbno(mp, d) != XFS_SB_DADDR) : \ + ASSERT(xfs_daddr_to_agno(mp, d) == \ + xfs_daddr_to_agno(mp, (d) + (len) - 1))) #endif /* __XFS_AG_H__ */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index d2bbcd882a6..d43813267a8 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -17,106 +17,153 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +struct workqueue_struct *xfs_alloc_wq; #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC int -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); - -#if defined(XFS_ALLOC_TRACE) -ktrace_t *xfs_alloc_trace_buf; - -#define TRACE_ALLOC(s,a) \ - xfs_alloc_trace_alloc(fname, s, a, __LINE__) -#define TRACE_FREE(s,a,b,x,f) \ - xfs_alloc_trace_free(fname, s, mp, a, b, x, f, __LINE__) -#define TRACE_MODAGF(s,a,f) \ - xfs_alloc_trace_modagf(fname, s, mp, a, f, __LINE__) -#define TRACE_BUSY(fname,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) -#define TRACE_UNBUSY(fname,s,ag,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) \ - xfs_alloc_trace_busy(fname, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) -#else -#define TRACE_ALLOC(s,a) -#define TRACE_FREE(s,a,b,x,f) -#define TRACE_MODAGF(s,a,f) -#define TRACE_BUSY(s,a,ag,agb,l,sl,tp) -#define TRACE_UNBUSY(fname,s,ag,sl,tp) -#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) -#endif /* XFS_ALLOC_TRACE */ - -/* - * Prototypes for per-ag allocation routines - */ - STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, - xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); + +/* + * Lookup the record equal to [bno, len] in the btree given by cur. + */ +STATIC int /* error */ +xfs_alloc_lookup_eq( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +/* + * Lookup the first record greater than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} + +/* + * Lookup the first record less than or equal to [bno, len] + * in the btree given by cur. + */ +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat) /* success/failure */ +{ + cur->bc_rec.a.ar_startblock = bno; + cur->bc_rec.a.ar_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); +} + +/* + * Update the record referred to by cur to the value given + * by [bno, len]. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_alloc_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len) /* length of extent */ +{ + union xfs_btree_rec rec; + + rec.alloc.ar_startblock = cpu_to_be32(bno); + rec.alloc.ar_blockcount = cpu_to_be32(len); + return xfs_btree_update(cur, &rec); +} /* - * Internal functions. + * Get the data from the pointed-to record. */ +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + *bno = be32_to_cpu(rec->alloc.ar_startblock); + *len = be32_to_cpu(rec->alloc.ar_blockcount); + } + return error; +} /* * Compute aligned version of the found extent. * Takes alignment and min length into account. */ -STATIC int /* success (>= minlen) */ +STATIC void xfs_alloc_compute_aligned( + xfs_alloc_arg_t *args, /* allocation argument structure */ xfs_agblock_t foundbno, /* starting block in found extent */ xfs_extlen_t foundlen, /* length in found extent */ - xfs_extlen_t alignment, /* alignment for allocation */ - xfs_extlen_t minlen, /* minimum length for allocation */ xfs_agblock_t *resbno, /* result block number */ xfs_extlen_t *reslen) /* result length */ { xfs_agblock_t bno; - xfs_extlen_t diff; xfs_extlen_t len; - if (alignment > 1 && foundlen >= minlen) { - bno = roundup(foundbno, alignment); - diff = bno - foundbno; - len = diff >= foundlen ? 0 : foundlen - diff; + /* Trim busy sections out of found extent */ + xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + + if (args->alignment > 1 && len >= args->minlen) { + xfs_agblock_t aligned_bno = roundup(bno, args->alignment); + xfs_extlen_t diff = aligned_bno - bno; + + *resbno = aligned_bno; + *reslen = diff >= len ? 0 : len - diff; } else { - bno = foundbno; - len = foundlen; + *resbno = bno; + *reslen = len; } - *resbno = bno; - *reslen = len; - return len >= minlen; } /* @@ -128,6 +175,7 @@ xfs_alloc_compute_diff( xfs_agblock_t wantbno, /* target starting block */ xfs_extlen_t wantlen, /* target length */ xfs_extlen_t alignment, /* target alignment */ + char userdata, /* are we allocating data? */ xfs_agblock_t freebno, /* freespace's starting block */ xfs_extlen_t freelen, /* freespace's length */ xfs_agblock_t *newbnop) /* result: best start block from free */ @@ -142,7 +190,14 @@ xfs_alloc_compute_diff( ASSERT(freelen >= wantlen); freeend = freebno + freelen; wantend = wantbno + wantlen; - if (freebno >= wantbno) { + /* + * We want to allocate from the start of a free extent if it is past + * the desired block or if we are allocating user data and the free + * extent is before desired block. The second case is there to allow + * for contiguous allocation from the remaining free space if the file + * grows in the short term. + */ + if (freebno >= wantbno || (userdata && freeend < wantend)) { if ((newbno1 = roundup(freebno, alignment)) >= freeend) newbno1 = NULLAGBLOCK; } else if (freeend >= wantend && alignment > 1) { @@ -202,16 +257,14 @@ xfs_alloc_fix_len( k = rlen % args->prod; if (k == args->mod) return; - if (k > args->mod) { - if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen) - return; - } else { - if ((int)(rlen = rlen - args->prod - (args->mod - k)) < - (int)args->minlen) - return; - } - ASSERT(rlen >= args->minlen); - ASSERT(rlen <= args->maxlen); + if (k > args->mod) + rlen = rlen - (k - args->mod); + else + rlen = rlen - args->prod + (args->mod - k); + if ((int)rlen < (int)args->minlen) + return; + ASSERT(rlen >= args->minlen && rlen <= args->maxlen); + ASSERT(rlen % args->prod == args->mod); args->len = rlen; } @@ -230,7 +283,6 @@ xfs_alloc_fix_minleft( return 1; agf = XFS_BUF_TO_AGF(args->agbp); diff = be32_to_cpu(agf->agf_freeblks) - + be32_to_cpu(agf->agf_flcount) - args->len - args->minleft; if (diff >= 0) return 1; @@ -295,21 +347,20 @@ xfs_alloc_fixup_trees( return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } + #ifdef DEBUG - { - xfs_alloc_block_t *bnoblock; - xfs_alloc_block_t *cntblock; - - if (bno_cur->bc_nlevels == 1 && - cnt_cur->bc_nlevels == 1) { - bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]); - cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN( - be16_to_cpu(bnoblock->bb_numrecs) == - be16_to_cpu(cntblock->bb_numrecs)); - } + if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { + struct xfs_btree_block *bnoblock; + struct xfs_btree_block *cntblock; + + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + + XFS_WANT_CORRUPTED_RETURN( + bnoblock->bb_numrecs == cntblock->bb_numrecs); } #endif + /* * Deal with all four cases: the allocated record is contained * within the freespace record, so we can have new freespace @@ -334,7 +385,7 @@ xfs_alloc_fixup_trees( /* * Delete the entry from the by-size btree. */ - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); /* @@ -344,7 +395,7 @@ xfs_alloc_fixup_trees( if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 0); - if ((error = xfs_alloc_insert(cnt_cur, &i))) + if ((error = xfs_btree_insert(cnt_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } @@ -352,7 +403,7 @@ xfs_alloc_fixup_trees( if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 0); - if ((error = xfs_alloc_insert(cnt_cur, &i))) + if ((error = xfs_btree_insert(cnt_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } @@ -363,7 +414,7 @@ xfs_alloc_fixup_trees( /* * No remaining freespace, just delete the by-block tree entry. */ - if ((error = xfs_alloc_delete(bno_cur, &i))) + if ((error = xfs_btree_delete(bno_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } else { @@ -380,13 +431,94 @@ xfs_alloc_fixup_trees( if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 0); - if ((error = xfs_alloc_insert(bno_cur, &i))) + if ((error = xfs_btree_insert(bno_cur, &i))) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); } return 0; } +static bool +xfs_agfl_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + int i; + + if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be32_to_cpu(agfl->agfl_magicnum) != XFS_AGFL_MAGIC) + return false; + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) + return false; + + for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { + if (be32_to_cpu(agfl->agfl_bno[i]) != NULLAGBLOCK && + be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) + return false; + } + return true; +} + +static void +xfs_agfl_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* + * There is no verification of non-crc AGFLs because mkfs does not + * initialise the AGFL to zero or NULL. Hence the only valid part of the + * AGFL is what the AGF says is active. We can't get to the AGF, so we + * can't verify just those entries are valid. + */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_agfl_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agfl_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc AGFLs */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_agfl_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) + XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agfl_buf_ops = { + .verify_read = xfs_agfl_read_verify, + .verify_write = xfs_agfl_write_verify, +}; + /* * Read in the allocation group free block array. */ @@ -404,133 +536,34 @@ xfs_alloc_read_agfl( error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); if (error) return error; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGFL, XFS_AGFL_REF); + xfs_buf_set_ref(bp, XFS_AGFL_REF); *bpp = bp; return 0; } -#if defined(XFS_ALLOC_TRACE) -/* - * Add an allocation trace entry for an alloc call. - */ -STATIC void -xfs_alloc_trace_alloc( - char *name, /* function tag string */ - char *str, /* additional string */ - xfs_alloc_arg_t *args, /* allocation argument structure */ - int line) /* source line number */ +STATIC int +xfs_alloc_update_counters( + struct xfs_trans *tp, + struct xfs_perag *pag, + struct xfs_buf *agbp, + long len) { - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(XFS_ALLOC_KTRACE_ALLOC | (line << 16)), - (void *)name, - (void *)str, - (void *)args->mp, - (void *)(__psunsigned_t)args->agno, - (void *)(__psunsigned_t)args->agbno, - (void *)(__psunsigned_t)args->minlen, - (void *)(__psunsigned_t)args->maxlen, - (void *)(__psunsigned_t)args->mod, - (void *)(__psunsigned_t)args->prod, - (void *)(__psunsigned_t)args->minleft, - (void *)(__psunsigned_t)args->total, - (void *)(__psunsigned_t)args->alignment, - (void *)(__psunsigned_t)args->len, - (void *)((((__psint_t)args->type) << 16) | - (__psint_t)args->otype), - (void *)(__psint_t)((args->wasdel << 3) | - (args->wasfromfl << 2) | - (args->isfl << 1) | - (args->userdata << 0))); -} + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); -/* - * Add an allocation trace entry for a free call. - */ -STATIC void -xfs_alloc_trace_free( - char *name, /* function tag string */ - char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* a.g. relative block number */ - xfs_extlen_t len, /* length of extent */ - int isfl, /* set if is freelist allocation/free */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(XFS_ALLOC_KTRACE_FREE | (line << 16)), - (void *)name, - (void *)str, - (void *)mp, - (void *)(__psunsigned_t)agno, - (void *)(__psunsigned_t)agbno, - (void *)(__psunsigned_t)len, - (void *)(__psint_t)isfl, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL); -} + pag->pagf_freeblks += len; + be32_add_cpu(&agf->agf_freeblks, len); -/* - * Add an allocation trace entry for modifying an agf. - */ -STATIC void -xfs_alloc_trace_modagf( - char *name, /* function tag string */ - char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount point */ - xfs_agf_t *agf, /* new agf value */ - int flags, /* logging flags for agf */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(XFS_ALLOC_KTRACE_MODAGF | (line << 16)), - (void *)name, - (void *)str, - (void *)mp, - (void *)(__psint_t)flags, - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_seqno), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_length), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_flfirst), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_fllast), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_flcount), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_freeblks), - (void *)(__psunsigned_t)be32_to_cpu(agf->agf_longest)); -} + xfs_trans_agblocks_delta(tp, len); + if (unlikely(be32_to_cpu(agf->agf_freeblks) > + be32_to_cpu(agf->agf_length))) + return EFSCORRUPTED; -STATIC void -xfs_alloc_trace_busy( - char *name, /* function tag string */ - char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* a.g. relative block number */ - xfs_extlen_t len, /* length of extent */ - int slot, /* perag Busy slot */ - xfs_trans_t *tp, - int trtype, /* type: add, delete, search */ - int line) /* source line number */ -{ - ktrace_enter(xfs_alloc_trace_buf, - (void *)(__psint_t)(trtype | (line << 16)), - (void *)name, - (void *)str, - (void *)mp, - (void *)(__psunsigned_t)agno, - (void *)(__psunsigned_t)agbno, - (void *)(__psunsigned_t)len, - (void *)(__psint_t)slot, - (void *)tp, - NULL, NULL, NULL, NULL, NULL, NULL, NULL); + xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); + return 0; } -#endif /* XFS_ALLOC_TRACE */ /* * Allocation group level functions. @@ -549,9 +582,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent"; -#endif ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -576,46 +606,36 @@ xfs_alloc_ag_vextent( ASSERT(0); /* NOTREACHED */ } - if (error) + + if (error || args->agbno == NULLAGBLOCK) return error; - /* - * If the allocation worked, need to change the agf structure - * (and log it), and the superblock. - */ - if (args->agbno != NULLAGBLOCK) { - xfs_agf_t *agf; /* allocation group freelist header */ -#ifdef XFS_ALLOC_TRACE - xfs_mount_t *mp = args->mp; -#endif - long slen = (long)args->len; - ASSERT(args->len >= args->minlen && args->len <= args->maxlen); - ASSERT(!(args->wasfromfl) || !args->isfl); - ASSERT(args->agbno % args->alignment == 0); - if (!(args->wasfromfl)) { - - agf = XFS_BUF_TO_AGF(args->agbp); - be32_add(&agf->agf_freeblks, -(args->len)); - xfs_trans_agblocks_delta(args->tp, - -((long)(args->len))); - args->pag->pagf_freeblks -= args->len; - ASSERT(be32_to_cpu(agf->agf_freeblks) <= - be32_to_cpu(agf->agf_length)); - TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); - xfs_alloc_log_agf(args->tp, args->agbp, - XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); - } - if (!args->isfl) - xfs_trans_mod_sb(args->tp, - args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS : - XFS_TRANS_SB_FDBLOCKS, -slen); - XFS_STATS_INC(xs_allocx); - XFS_STATS_ADD(xs_allocb, args->len); + ASSERT(args->len >= args->minlen); + ASSERT(args->len <= args->maxlen); + ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(args->agbno % args->alignment == 0); + + if (!args->wasfromfl) { + error = xfs_alloc_update_counters(args->tp, args->pag, + args->agbp, + -((long)(args->len))); + if (error) + return error; + + ASSERT(!xfs_extent_busy_search(args->mp, args->agno, + args->agbno, args->len)); } - return 0; + + if (!args->isfl) { + xfs_trans_mod_sb(args->tp, args->wasdel ? + XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS, + -((long)(args->len))); + } + + XFS_STATS_INC(xs_allocx); + XFS_STATS_ADD(xs_allocb, args->len); + return error; } /* @@ -630,97 +650,194 @@ xfs_alloc_ag_vextent_exact( { xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ - xfs_agblock_t end; /* end of allocated extent */ int error; xfs_agblock_t fbno; /* start block of found extent */ - xfs_agblock_t fend; /* end block of found extent */ xfs_extlen_t flen; /* length of found extent */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_exact"; -#endif + xfs_agblock_t tbno; /* start block of trimmed extent */ + xfs_extlen_t tlen; /* length of trimmed extent */ + xfs_agblock_t tend; /* end block of trimmed extent */ int i; /* success/failure of operation */ - xfs_agblock_t maxend; /* end of maximal extent */ - xfs_agblock_t minend; /* end of minimal extent */ - xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); + /* * Allocate/initialize a cursor for the by-number freespace btree. */ - bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); + /* * Lookup bno and minlen in the btree (minlen is irrelevant, really). * Look for the closest free block <= bno, it must contain bno * if any free block does. */ - if ((error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i))) + error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); + if (error) goto error0; - if (!i) { - /* - * Didn't find it, return null. - */ - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + if (!i) + goto not_found; + /* * Grab the freespace record. */ - if ((error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i))) + error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); + if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); - minend = args->agbno + args->minlen; - maxend = args->agbno + args->maxlen; - fend = fbno + flen; + /* - * Give up if the freespace isn't long enough for the minimum request. + * Check for overlapping busy extents. */ - if (fend < minend) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - args->agbno = NULLAGBLOCK; - return 0; - } + xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen); + /* - * End of extent will be smaller of the freespace end and the - * maximal requested end. + * Give up if the start of the extent is busy, or the freespace isn't + * long enough for the minimum request. */ - end = XFS_AGBLOCK_MIN(fend, maxend); + if (tbno > args->agbno) + goto not_found; + if (tlen < args->minlen) + goto not_found; + tend = tbno + tlen; + if (tend < args->agbno + args->minlen) + goto not_found; + /* + * End of extent will be smaller of the freespace end and the + * maximal requested end. + * * Fix the length according to mod and prod if given. */ - args->len = end - args->agbno; + args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) + - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); - return 0; - } - rlen = args->len; - ASSERT(args->agbno + rlen <= fend); - end = args->agbno + rlen; + if (!xfs_alloc_fix_minleft(args)) + goto not_found; + + ASSERT(args->agbno + args->len <= tend); + /* - * We are allocating agbno for rlen [agbno .. end] + * We are allocating agbno for args->len * Allocate/initialize a cursor for the by-size btree. */ - cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT, NULL, 0); + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); ASSERT(args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); - if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, - args->agbno, args->len, XFSA_FIXUP_BNO_OK))) { + error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, + args->len, XFSA_FIXUP_BNO_OK); + if (error) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); goto error0; } + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("normal", args); + args->wasfromfl = 0; + trace_xfs_alloc_exact_done(args); + return 0; + +not_found: + /* Didn't find it, return null. */ + xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); + args->agbno = NULLAGBLOCK; + trace_xfs_alloc_exact_notfound(args); return 0; error0: xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); - TRACE_ALLOC("error", args); + trace_xfs_alloc_exact_error(args); + return error; +} + +/* + * Search the btree in a given direction via the search cursor and compare + * the records found against the good extent we've already found. + */ +STATIC int +xfs_alloc_find_best_extent( + struct xfs_alloc_arg *args, /* allocation argument structure */ + struct xfs_btree_cur **gcur, /* good cursor */ + struct xfs_btree_cur **scur, /* searching cursor */ + xfs_agblock_t gdiff, /* difference for search comparison */ + xfs_agblock_t *sbno, /* extent found by search */ + xfs_extlen_t *slen, /* extent length */ + xfs_agblock_t *sbnoa, /* aligned extent found by search */ + xfs_extlen_t *slena, /* aligned extent length */ + int dir) /* 0 = search right, 1 = search left */ +{ + xfs_agblock_t new; + xfs_agblock_t sdiff; + int error; + int i; + + /* The good extent is perfect, no need to search. */ + if (!gdiff) + goto out_use_good; + + /* + * Look until we find a better one, run out of space or run off the end. + */ + do { + error = xfs_alloc_get_rec(*scur, sbno, slen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); + + /* + * The good extent is closer than this one. + */ + if (!dir) { + if (*sbnoa >= args->agbno + gdiff) + goto out_use_good; + } else { + if (*sbnoa <= args->agbno - gdiff) + goto out_use_good; + } + + /* + * Same distance, compare length and pick the best. + */ + if (*slena >= args->minlen) { + args->len = XFS_EXTLEN_MIN(*slena, args->maxlen); + xfs_alloc_fix_len(args); + + sdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, + args->userdata, *sbnoa, + *slena, &new); + + /* + * Choose closer size and invalidate other cursor. + */ + if (sdiff < gdiff) + goto out_use_search; + goto out_use_good; + } + + if (!dir) + error = xfs_btree_increment(*scur, 0, &i); + else + error = xfs_btree_decrement(*scur, 0, &i); + if (error) + goto error0; + } while (i); + +out_use_good: + xfs_btree_del_cursor(*scur, XFS_BTREE_NOERROR); + *scur = NULL; + return 0; + +out_use_search: + xfs_btree_del_cursor(*gcur, XFS_BTREE_NOERROR); + *gcur = NULL; + return 0; + +error0: + /* caller invalidates cursors */ return error; } @@ -737,9 +854,6 @@ xfs_alloc_ag_vextent_near( xfs_btree_cur_t *bno_cur_gt; /* cursor for bno btree, right side */ xfs_btree_cur_t *bno_cur_lt; /* cursor for bno btree, left side */ xfs_btree_cur_t *cnt_cur; /* cursor for count btree */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_near"; -#endif xfs_agblock_t gtbno; /* start bno of right side entry */ xfs_agblock_t gtbnoa; /* aligned ... */ xfs_extlen_t gtdiff; /* difference to right side entry */ @@ -752,27 +866,33 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltbno; /* start bno of left side entry */ xfs_agblock_t ltbnoa; /* aligned ... */ xfs_extlen_t ltdiff; /* difference to left side entry */ - /*REFERENCED*/ - xfs_agblock_t ltend; /* end bno of left side entry */ xfs_extlen_t ltlen; /* length of left side entry */ xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ -#if defined(DEBUG) && defined(__KERNEL__) + int forced = 0; +#ifdef DEBUG /* * Randomly don't execute the first algorithm. */ int dofirst; /* set to do first algorithm */ - dofirst = random() & 1; + dofirst = prandom_u32() & 1; #endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + /* * Get a cursor for the by-size btree. */ - cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT, NULL, 0); - ltlen = 0; - bno_cur_lt = bno_cur_gt = NULL; + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); + /* * See if there are any free extents as big as maxlen. */ @@ -788,11 +908,13 @@ xfs_alloc_ag_vextent_near( goto error0; if (i == 0 || ltlen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); return 0; } ASSERT(i == 1); } args->wasfromfl = 0; + /* * First algorithm. * If the requested extent is large wrt the freespaces available @@ -809,8 +931,8 @@ xfs_alloc_ag_vextent_near( xfs_extlen_t blen=0; xfs_agblock_t bnew=0; -#if defined(DEBUG) && defined(__KERNEL__) - if (!dofirst) +#ifdef DEBUG + if (dofirst) break; #endif /* @@ -828,7 +950,7 @@ xfs_alloc_ag_vextent_near( XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (ltlen >= args->minlen) break; - if ((error = xfs_alloc_increment(cnt_cur, 0, &i))) + if ((error = xfs_btree_increment(cnt_cur, 0, &i))) goto error0; } while (i); ASSERT(ltlen >= args->minlen); @@ -838,7 +960,7 @@ xfs_alloc_ag_vextent_near( i = cnt_cur->bc_ptrs[0]; for (j = 1, blen = 0, bdiff = 0; !error && j && (blen < args->maxlen || bdiff > 0); - error = xfs_alloc_increment(cnt_cur, 0, &j)) { + error = xfs_btree_increment(cnt_cur, 0, &j)) { /* * For each entry, decide if it's better than * the previous best entry. @@ -846,9 +968,9 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (!xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena)) + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena < args->minlen) continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); @@ -856,7 +978,8 @@ xfs_alloc_ag_vextent_near( if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbno, ltlen, <new); + args->alignment, args->userdata, ltbnoa, + ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { bdiff = ltdiff; @@ -878,12 +1001,11 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(cnt_cur, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - ltend = ltbno + ltlen; - ASSERT(ltend <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("nominleft", args); + trace_xfs_alloc_near_nominleft(args); return 0; } blen = args->len; @@ -892,12 +1014,12 @@ xfs_alloc_ag_vextent_near( */ args->agbno = bnew; ASSERT(bnew >= ltbno); - ASSERT(bnew + blen <= ltend); + ASSERT(bnew + blen <= ltbno + ltlen); /* * Set up a cursor for the by-bno tree. */ - bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, - args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, + args->agbp, args->agno, XFS_BTNUM_BNO); /* * Fix up the btree entries. */ @@ -906,7 +1028,8 @@ xfs_alloc_ag_vextent_near( goto error0; xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - TRACE_ALLOC("first", args); + + trace_xfs_alloc_near_first(args); return 0; } /* @@ -924,8 +1047,8 @@ xfs_alloc_ag_vextent_near( /* * Allocate and initialize the cursor for the leftward search. */ - bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); /* * Lookup <= bno to find the leftward search's starting point. */ @@ -948,7 +1071,7 @@ xfs_alloc_ag_vextent_near( * Increment the cursor, so we will point at the entry just right * of the leftward entry if any, or to the leftmost entry. */ - if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; if (!i) { /* @@ -967,11 +1090,11 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_lt, <bno, <len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena)) + xfs_alloc_compute_aligned(args, ltbno, ltlen, + <bnoa, <lena); + if (ltlena >= args->minlen) break; - if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) + if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; if (!i) { xfs_btree_del_cursor(bno_cur_lt, @@ -983,11 +1106,11 @@ xfs_alloc_ag_vextent_near( if ((error = xfs_alloc_get_rec(bno_cur_gt, >bno, >len, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - >bnoa, >lena)) + xfs_alloc_compute_aligned(args, gtbno, gtlen, + >bnoa, >lena); + if (gtlena >= args->minlen) break; - if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) + if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; if (!i) { xfs_btree_del_cursor(bno_cur_gt, @@ -996,211 +1119,65 @@ xfs_alloc_ag_vextent_near( } } } while (bno_cur_lt || bno_cur_gt); + /* * Got both cursors still active, need to find better entry. */ if (bno_cur_lt && bno_cur_gt) { - /* - * Left side is long enough, look for a right side entry. - */ if (ltlena >= args->minlen) { /* - * Fix up the length. + * Left side is good, look for a right side entry. */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff(args->agbno, rlen, - args->alignment, ltbno, ltlen, <new); - /* - * Not perfect. - */ - if (ltdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_gt, >bno, - >len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(gtbno, gtlen, - args->alignment, args->minlen, - >bnoa, >lena); - /* - * The left one is clearly better. - */ - if (gtbnoa >= args->agbno + ltdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (gtlena >= args->minlen) { - args->len = - XFS_EXTLEN_MIN(gtlena, - args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - gtbno, gtlen, >new); - /* - * Right side is better. - */ - if (gtdiff < ltdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - /* - * Left side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - break; - } - /* - * Fell off the right end. - */ - if ((error = xfs_alloc_increment( - bno_cur_gt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - break; - } - } - } - /* - * The left side is perfect, trash the right side. - */ - else { - xfs_btree_del_cursor(bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - } - /* - * It's the right side that was found first, look left. - */ - else { + ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, ltbnoa, + ltlena, <new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_lt, &bno_cur_gt, + ltdiff, >bno, >len, + >bnoa, >lena, + 0 /* search right */); + } else { + ASSERT(gtlena >= args->minlen); + /* - * Fix up the length. + * Right side is good, look for a left side entry. */ args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); - rlen = args->len; - gtdiff = xfs_alloc_compute_diff(args->agbno, rlen, - args->alignment, gtbno, gtlen, >new); - /* - * Right side entry isn't perfect. - */ - if (gtdiff) { - /* - * Look until we find a better one, run out of - * space, or run off the end. - */ - while (bno_cur_lt && bno_cur_gt) { - if ((error = xfs_alloc_get_rec( - bno_cur_lt, <bno, - <len, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(ltbno, ltlen, - args->alignment, args->minlen, - <bnoa, <lena); - /* - * The right one is clearly better. - */ - if (ltbnoa <= args->agbno - gtdiff) { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - /* - * If we reach a big enough entry, - * compare the two and pick the best. - */ - if (ltlena >= args->minlen) { - args->len = XFS_EXTLEN_MIN( - ltlena, args->maxlen); - xfs_alloc_fix_len(args); - rlen = args->len; - ltdiff = xfs_alloc_compute_diff( - args->agbno, rlen, - args->alignment, - ltbno, ltlen, <new); - /* - * Left side is better. - */ - if (ltdiff < gtdiff) { - xfs_btree_del_cursor( - bno_cur_gt, - XFS_BTREE_NOERROR); - bno_cur_gt = NULL; - } - /* - * Right side is better. - */ - else { - xfs_btree_del_cursor( - bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } - break; - } - /* - * Fell off the left end. - */ - if ((error = xfs_alloc_decrement( - bno_cur_lt, 0, &i))) - goto error0; - if (!i) { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - break; - } - } - } - /* - * The right side is perfect, trash the left side. - */ - else { - xfs_btree_del_cursor(bno_cur_lt, - XFS_BTREE_NOERROR); - bno_cur_lt = NULL; - } + gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, + args->alignment, args->userdata, gtbnoa, + gtlena, >new); + + error = xfs_alloc_find_best_extent(args, + &bno_cur_gt, &bno_cur_lt, + gtdiff, <bno, <len, + <bnoa, <lena, + 1 /* search left */); } + + if (error) + goto error0; } + /* * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { - TRACE_ALLOC("neither", args); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; } + /* * At this point we have selected a freespace entry, either to the * left or to the right. If it's on the right, copy all the @@ -1217,35 +1194,41 @@ xfs_alloc_ag_vextent_near( j = 1; } else j = 0; + /* * Fix up the length and compute the useful address. */ - ltend = ltbno + ltlen; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) { - TRACE_ALLOC("nominleft", args); + trace_xfs_alloc_near_nominleft(args); xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); return 0; } rlen = args->len; - (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, - ltlen, <new); + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + args->userdata, ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltend); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->agbno = ltnew; + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, ltnew, rlen, XFSA_FIXUP_BNO_OK))) goto error0; - TRACE_ALLOC(j ? "gt" : "lt", args); + + if (j) + trace_xfs_alloc_near_greater(args); + else + trace_xfs_alloc_near_lesser(args); + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); return 0; error0: - TRACE_ALLOC("error", args); + trace_xfs_alloc_near_error(args); if (cnt_cur != NULL) xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); if (bno_cur_lt != NULL) @@ -1270,56 +1253,95 @@ xfs_alloc_ag_vextent_size( int error; /* error result */ xfs_agblock_t fbno; /* start of found freespace */ xfs_extlen_t flen; /* length of found freespace */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_size"; -#endif int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +restart: /* * Allocate and initialize a cursor for the by-size btree. */ - cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_CNT, NULL, 0); + cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_CNT); bno_cur = NULL; + /* * Look for an entry >= maxlen+alignment-1 blocks. */ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen + args->alignment - 1, &i))) goto error0; + /* - * If none, then pick up the last entry in the tree unless the - * tree is empty. + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. */ - if (!i) { - if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, - &flen, &i))) + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) goto error0; if (i == 0 || flen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("noentry", args); + trace_xfs_alloc_size_noentry(args); return 0; } ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. + * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. + */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } } - /* - * There's a freespace as big as maxlen+alignment-1, get it. - */ - else { - if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } + /* * In the first case above, we got the last entry in the * by-size btree. Now we check to see if the space hits maxlen * once aligned; if not, we search left for something better. * This can't happen in the second case above. */ - xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, - &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); @@ -1334,7 +1356,7 @@ xfs_alloc_ag_vextent_size( bestflen = flen; bestfbno = fbno; for (;;) { - if ((error = xfs_alloc_decrement(cnt_cur, 0, &i))) + if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) goto error0; if (i == 0) break; @@ -1344,8 +1366,8 @@ xfs_alloc_ag_vextent_size( XFS_WANT_CORRUPTED_GOTO(i == 1, error0); if (flen < bestrlen) break; - xfs_alloc_compute_aligned(fbno, flen, args->alignment, - args->minlen, &rbno, &rlen); + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), @@ -1373,20 +1395,26 @@ xfs_alloc_ag_vextent_size( * Fix up the length. */ args->len = rlen; - xfs_alloc_fix_len(args); - if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - TRACE_ALLOC("nominleft", args); - args->agbno = NULLAGBLOCK; - return 0; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); /* * Allocate and initialize a cursor for the by-block tree. */ - bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp, - args->agno, XFS_BTNUM_BNO, NULL, 0); + bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, + args->agno, XFS_BTNUM_BNO); if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, rbno, rlen, XFSA_FIXUP_CNT_OK))) goto error0; @@ -1399,16 +1427,22 @@ xfs_alloc_ag_vextent_size( args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); - TRACE_ALLOC("normal", args); + trace_xfs_alloc_size_done(args); return 0; error0: - TRACE_ALLOC("error", args); + trace_xfs_alloc_size_error(args); if (cnt_cur) xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; } /* @@ -1427,12 +1461,9 @@ xfs_alloc_ag_vextent_small( int error; xfs_agblock_t fbno; xfs_extlen_t flen; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_ag_vextent_small"; -#endif int i; - if ((error = xfs_alloc_decrement(ccur, 0, &i))) + if ((error = xfs_btree_decrement(ccur, 0, &i))) goto error0; if (i) { if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) @@ -1447,9 +1478,13 @@ xfs_alloc_ag_vextent_small( else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { - if ((error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno))) + error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); + if (error) goto error0; if (fbno != NULLAGBLOCK) { + xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + if (args->userdata) { xfs_buf_t *bp; @@ -1464,7 +1499,7 @@ xfs_alloc_ag_vextent_small( be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); args->wasfromfl = 1; - TRACE_ALLOC("freelist", args); + trace_xfs_alloc_small_freelist(args); *stat = 0; return 0; } @@ -1477,24 +1512,26 @@ xfs_alloc_ag_vextent_small( /* * Can't allocate from the freelist for some reason. */ - else + else { + fbno = NULLAGBLOCK; flen = 0; + } /* * Can't do the allocation, give up. */ if (flen < args->minlen) { args->agbno = NULLAGBLOCK; - TRACE_ALLOC("notenough", args); + trace_xfs_alloc_small_notenough(args); flen = 0; } *fbnop = fbno; *flenp = flen; *stat = 1; - TRACE_ALLOC("normal", args); + trace_xfs_alloc_small_done(args); return 0; error0: - TRACE_ALLOC("error", args); + trace_xfs_alloc_small_error(args); return error; } @@ -1513,9 +1550,6 @@ xfs_free_ag_extent( xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ int error; /* error return value */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_free_ag_extent"; -#endif xfs_agblock_t gtbno; /* start of right neighbor block */ xfs_extlen_t gtlen; /* length of right neighbor block */ int haveleft; /* have a left neighbor block */ @@ -1526,13 +1560,13 @@ xfs_free_ag_extent( xfs_mount_t *mp; /* mount point struct for filesystem */ xfs_agblock_t nbno; /* new starting block of freespace */ xfs_extlen_t nlen; /* new length of freespace */ + xfs_perag_t *pag; /* per allocation group data */ mp = tp->t_mountp; /* * Allocate and initialize a cursor for the by-block btree. */ - bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL, - 0); + bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO); cnt_cur = NULL; /* * Look for a neighboring block on the left (lower block numbers) @@ -1565,7 +1599,7 @@ xfs_free_ag_extent( * Look for a neighboring block on the right (higher block numbers) * that is contiguous with this space. */ - if ((error = xfs_alloc_increment(bno_cur, 0, &haveright))) + if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) goto error0; if (haveright) { /* @@ -1591,8 +1625,7 @@ xfs_free_ag_extent( /* * Now allocate and initialize a cursor for the by-size tree. */ - cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL, - 0); + cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT); /* * Have both left and right contiguous neighbors. * Merge all three into a single free block. @@ -1604,7 +1637,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* @@ -1613,19 +1646,19 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* * Delete the old by-block entry for the right block. */ - if ((error = xfs_alloc_delete(bno_cur, &i))) + if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* * Move the by-block cursor back to the left neighbor. */ - if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); #ifdef DEBUG @@ -1664,14 +1697,14 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* * Back up the by-block cursor to the left neighbor, and * update its length. */ - if ((error = xfs_alloc_decrement(bno_cur, 0, &i))) + if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); nbno = ltbno; @@ -1690,7 +1723,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_delete(cnt_cur, &i))) + if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); /* @@ -1709,7 +1742,7 @@ xfs_free_ag_extent( else { nbno = bno; nlen = len; - if ((error = xfs_alloc_insert(bno_cur, &i))) + if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); } @@ -1721,55 +1754,32 @@ xfs_free_ag_extent( if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 0, error0); - if ((error = xfs_alloc_insert(cnt_cur, &i))) + if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; + /* * Update the freespace totals in the ag and superblock. */ - { - xfs_agf_t *agf; - xfs_perag_t *pag; /* per allocation group data */ - - agf = XFS_BUF_TO_AGF(agbp); - pag = &mp->m_perag[agno]; - be32_add(&agf->agf_freeblks, len); - xfs_trans_agblocks_delta(tp, len); - pag->pagf_freeblks += len; - XFS_WANT_CORRUPTED_GOTO( - be32_to_cpu(agf->agf_freeblks) <= - be32_to_cpu(agf->agf_length), - error0); - TRACE_MODAGF(NULL, agf, XFS_AGF_FREEBLKS); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); - if (!isfl) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); - XFS_STATS_INC(xs_freex); - XFS_STATS_ADD(xs_freeb, len); - } - TRACE_FREE(haveleft ? - (haveright ? "both" : "left") : - (haveright ? "right" : "none"), - agno, bno, len, isfl); - - /* - * Since blocks move to the free list without the coordination - * used in xfs_bmap_finish, we can't allow block to be available - * for reallocation and non-transaction writing (user data) - * until we know that the transaction that moved it to the free - * list is permanently on disk. We track the blocks by declaring - * these blocks as "busy"; the busy list is maintained on a per-ag - * basis and each transaction records which entries should be removed - * when the iclog commits to disk. If a busy block is allocated, - * the iclog is pushed up to the LSN that freed the block. - */ - xfs_alloc_mark_busy(tp, agno, bno, len); + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_perag_put(pag); + if (error) + goto error0; + + if (!isfl) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); + XFS_STATS_INC(xs_freex); + XFS_STATS_ADD(xs_freeb, len); + + trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + return 0; error0: - TRACE_FREE("error", agno, bno, len, isfl); + trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -1805,6 +1815,25 @@ xfs_alloc_compute_maxlevels( } /* + * Find the length of the longest extent in an AG. + */ +xfs_extlen_t +xfs_alloc_longest_free_extent( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + xfs_extlen_t need, delta = 0; + + need = XFS_MIN_FREELIST_PAG(pag, mp); + if (need > pag->pagf_flcount) + delta = need - pag->pagf_flcount; + + if (pag->pagf_longest > delta) + return pag->pagf_longest - delta; + return pag->pagf_flcount > 0 || pag->pagf_longest > 0; +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -1856,15 +1885,12 @@ xfs_alloc_fix_freelist( } if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? need - pag->pagf_flcount : 0; /* * If it looks like there isn't a long enough extent, or enough * total blocks, reject it. */ - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || pag->pagf_longest > 0); + need = XFS_MIN_FREELIST_PAG(pag, mp); + longest = xfs_alloc_longest_free_extent(mp, pag); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest || ((int)(pag->pagf_freeblks + pag->pagf_flcount - @@ -1921,7 +1947,8 @@ xfs_alloc_fix_freelist( while (be32_to_cpu(agf->agf_flcount) > need) { xfs_buf_t *bp; - if ((error = xfs_alloc_get_freelist(tp, agbp, &bno))) + error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); + if (error) return error; if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) return error; @@ -1931,12 +1958,11 @@ xfs_alloc_fix_freelist( /* * Initialize the args structure. */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.mod = targs.minleft = targs.wasdel = targs.userdata = - targs.minalignslop = 0; targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; @@ -1971,8 +1997,9 @@ xfs_alloc_fix_freelist( * Put each allocated block on the list. */ for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { - if ((error = xfs_alloc_put_freelist(tp, agbp, agflbp, - bno))) + error = xfs_alloc_put_freelist(tp, agbp, + agflbp, bno, 0); + if (error) return error; } } @@ -1989,23 +2016,22 @@ int /* error */ xfs_alloc_get_freelist( xfs_trans_t *tp, /* transaction pointer */ xfs_buf_t *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop) /* block address retrieved from freelist */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk) /* destination is a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. freelist structure */ xfs_buf_t *agflbp;/* buffer for a.g. freelist structure */ xfs_agblock_t bno; /* block number returned */ + __be32 *agfl_bno; int error; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_get_freelist"; -#endif - xfs_mount_t *mp; /* mount structure */ + int logflags; + xfs_mount_t *mp = tp->t_mountp; xfs_perag_t *pag; /* per allocation group data */ - agf = XFS_BUF_TO_AGF(agbp); /* * Freelist is empty, give up. */ + agf = XFS_BUF_TO_AGF(agbp); if (!agf->agf_flcount) { *bnop = NULLAGBLOCK; return 0; @@ -2013,36 +2039,38 @@ xfs_alloc_get_freelist( /* * Read the array of free blocks. */ - mp = tp->t_mountp; - if ((error = xfs_alloc_read_agfl(mp, tp, - be32_to_cpu(agf->agf_seqno), &agflbp))) + error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), + &agflbp); + if (error) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); + + /* * Get the block number and update the data structures. */ - bno = INT_GET(agfl->agfl_bno[be32_to_cpu(agf->agf_flfirst)], ARCH_CONVERT); - be32_add(&agf->agf_flfirst, 1); + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); + be32_add_cpu(&agf->agf_flfirst, 1); xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) agf->agf_flfirst = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; - be32_add(&agf->agf_flcount, -1); + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; - TRACE_MODAGF(NULL, agf, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT); + xfs_perag_put(pag); + + logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, 1); + pag->pagf_btreeblks++; + logflags |= XFS_AGF_BTREEBLKS; + } + + xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; - /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. - */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); return 0; } @@ -2069,9 +2097,15 @@ xfs_alloc_log_agf( offsetof(xfs_agf_t, agf_flcount), offsetof(xfs_agf_t, agf_freeblks), offsetof(xfs_agf_t, agf_longest), + offsetof(xfs_agf_t, agf_btreeblks), + offsetof(xfs_agf_t, agf_uuid), sizeof(xfs_agf_t) }; + trace_xfs_agf(tp->t_mountp, XFS_BUF_TO_AGF(bp), fields, _RET_IP_); + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); + xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); } @@ -2104,17 +2138,17 @@ xfs_alloc_put_freelist( xfs_trans_t *tp, /* transaction pointer */ xfs_buf_t *agbp, /* buffer for a.g. freelist header */ xfs_buf_t *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno) /* block being freed */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk) /* block came from a AGF btree */ { xfs_agf_t *agf; /* a.g. freespace structure */ - xfs_agfl_t *agfl; /* a.g. free block array */ - xfs_agblock_t *blockp;/* pointer to array entry */ + __be32 *blockp;/* pointer to array entry */ int error; -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_put_freelist"; -#endif + int logflags; xfs_mount_t *mp; /* mount structure */ xfs_perag_t *pag; /* per allocation group data */ + __be32 *agfl_bno; + int startoff; agf = XFS_BUF_TO_AGF(agbp); mp = tp->t_mountp; @@ -2122,92 +2156,198 @@ xfs_alloc_put_freelist( if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno), &agflbp))) return error; - agfl = XFS_BUF_TO_AGFL(agflbp); - be32_add(&agf->agf_fllast, 1); + be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; - be32_add(&agf->agf_flcount, 1); + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); + be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; + + logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; + if (btreeblk) { + be32_add_cpu(&agf->agf_btreeblks, -1); + pag->pagf_btreeblks--; + logflags |= XFS_AGF_BTREEBLKS; + } + xfs_perag_put(pag); + + xfs_alloc_log_agf(tp, agbp, logflags); + ASSERT(be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)); - blockp = &agfl->agfl_bno[be32_to_cpu(agf->agf_fllast)]; - INT_SET(*blockp, ARCH_CONVERT, bno); - TRACE_MODAGF(NULL, agf, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); - xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLLAST | XFS_AGF_FLCOUNT); - xfs_trans_log_buf(tp, agflbp, - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl), - (int)((xfs_caddr_t)blockp - (xfs_caddr_t)agfl + - sizeof(xfs_agblock_t) - 1)); + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; + *blockp = cpu_to_be32(bno); + startoff = (char *)blockp - (char *)agflbp->b_addr; + + xfs_alloc_log_agf(tp, agbp, logflags); + + xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); + xfs_trans_log_buf(tp, agflbp, startoff, + startoff + sizeof(xfs_agblock_t) - 1); return 0; } +static bool +xfs_agf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) + { + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_uuid)) + return false; + + if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && + XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && + be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && + be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && + be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) + return false; + + /* + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. + */ + if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) + return false; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) + return false; + + return true;; + +} + +static void +xfs_agf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, + XFS_ERRTAG_ALLOC_READ_AGF, + XFS_RANDOM_ALLOC_READ_AGF)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agf_verify(mp, bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agf_buf_ops = { + .verify_read = xfs_agf_read_verify, + .verify_write = xfs_agf_write_verify, +}; + /* * Read in the allocation group header (free/alloc section). */ int /* error */ -xfs_alloc_read_agf( - xfs_mount_t *mp, /* mount point structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - int flags, /* XFS_ALLOC_FLAG_... */ - xfs_buf_t **bpp) /* buffer for the ag freelist header */ +xfs_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_BUF_ */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ { - xfs_agf_t *agf; /* ag freelist header */ - int agf_ok; /* set if agf is consistent */ - xfs_buf_t *bp; /* return value */ - xfs_perag_t *pag; /* per allocation group data */ int error; + trace_xfs_read_agf(mp, agno); + ASSERT(agno != NULLAGNUMBER); error = xfs_trans_read_buf( mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), - (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U, - &bp); + XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); if (error) return error; - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (!bp) { - *bpp = NULL; + if (!*bpp) return 0; - } - /* - * Validate the magic number of the agf block. - */ - agf = XFS_BUF_TO_AGF(bp); - agf_ok = - be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC && - XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && - be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && - be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && - be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); - if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, - XFS_RANDOM_ALLOC_READ_AGF))) { - XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", - XFS_ERRLEVEL_LOW, mp, agf); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } - pag = &mp->m_perag[agno]; + + ASSERT(!(*bpp)->b_error); + xfs_buf_set_ref(*bpp, XFS_AGF_REF); + return 0; +} + +/* + * Read in the allocation group header (free/alloc section). + */ +int /* error */ +xfs_alloc_read_agf( + struct xfs_mount *mp, /* mount point structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + int flags, /* XFS_ALLOC_FLAG_... */ + struct xfs_buf **bpp) /* buffer for the ag freelist header */ +{ + struct xfs_agf *agf; /* ag freelist header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + trace_xfs_alloc_read_agf(mp, agno); + + ASSERT(agno != NULLAGNUMBER); + error = xfs_read_agf(mp, tp, agno, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, + bpp); + if (error) + return error; + if (!*bpp) + return 0; + ASSERT(!(*bpp)->b_error); + + agf = XFS_BUF_TO_AGF(*bpp); + pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); + pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); pag->pagf_longest = be32_to_cpu(agf->agf_longest); pag->pagf_levels[XFS_BTNUM_BNOi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); - spinlock_init(&pag->pagb_lock, "xfspagb"); - pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * - sizeof(xfs_perag_busy_t), KM_SLEEP); + spin_lock_init(&pag->pagb_lock); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG else if (!XFS_FORCED_SHUTDOWN(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); + ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == @@ -2216,8 +2356,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF); - *bpp = bp; + xfs_perag_put(pag); return 0; } @@ -2233,9 +2372,6 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ -#ifdef XFS_ALLOC_TRACE - static char fname[] = "xfs_alloc_vextent"; -#endif xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ @@ -2267,7 +2403,7 @@ xfs_alloc_vextent( args->minlen > args->maxlen || args->minlen > agsize || args->mod >= args->prod) { args->fsbno = NULLFSBLOCK; - TRACE_ALLOC("badargs", args); + trace_xfs_alloc_vextent_badargs(args); return 0; } minleft = args->minleft; @@ -2280,24 +2416,21 @@ xfs_alloc_vextent( * These three force us into a single a.g. */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - down_read(&mp->m_peraglock); - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); args->minleft = minleft; if (error) { - TRACE_ALLOC("nofix", args); + trace_xfs_alloc_vextent_nofix(args); goto error0; } if (!args->agbp) { - up_read(&mp->m_peraglock); - TRACE_ALLOC("noagbp", args); + trace_xfs_alloc_vextent_noagbp(args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); if ((error = xfs_alloc_ag_vextent(args))) goto error0; - up_read(&mp->m_peraglock); break; case XFS_ALLOCTYPE_START_BNO: /* @@ -2349,14 +2482,13 @@ xfs_alloc_vextent( * Loop over allocation groups twice; first time with * trylock set, second time without. */ - down_read(&mp->m_peraglock); for (;;) { - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); args->minleft = minleft; if (error) { - TRACE_ALLOC("nofix", args); + trace_xfs_alloc_vextent_nofix(args); goto error0; } /* @@ -2367,7 +2499,9 @@ xfs_alloc_vextent( goto error0; break; } - TRACE_ALLOC("loopfailed", args); + + trace_xfs_alloc_vextent_loopfailed(args); + /* * Didn't work, figure out the next iteration. */ @@ -2394,7 +2528,7 @@ xfs_alloc_vextent( if (args->agno == sagno) { if (no_min == 1) { args->agbno = NULLAGBLOCK; - TRACE_ALLOC("allfailed", args); + trace_xfs_alloc_vextent_allfailed(args); break; } if (flags == 0) { @@ -2408,8 +2542,8 @@ xfs_alloc_vextent( } } } + xfs_perag_put(args->pag); } - up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % @@ -2435,9 +2569,10 @@ xfs_alloc_vextent( args->len); #endif } + xfs_perag_put(args->pag); return 0; error0: - up_read(&mp->m_peraglock); + xfs_perag_put(args->pag); return error; } @@ -2459,166 +2594,37 @@ xfs_free_extent( memset(&args, 0, sizeof(xfs_alloc_arg_t)); args.tp = tp; args.mp = tp->t_mountp; - args.agno = XFS_FSB_TO_AGNO(args.mp, bno); - ASSERT(args.agno < args.mp->m_sb.sb_agcount); - args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - down_read(&args.mp->m_peraglock); - args.pag = &args.mp->m_perag[args.agno]; - if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) - goto error0; -#ifdef DEBUG - ASSERT(args.agbp != NULL); - ASSERT((args.agbno + len) <= - be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); -#endif - error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); -error0: - up_read(&args.mp->m_peraglock); - return error; -} - - -/* - * AG Busy list management - * The busy list contains block ranges that have been freed but whose - * transactions have not yet hit disk. If any block listed in a busy - * list is reused, the transaction that freed it must be forced to disk - * before continuing to use the block. - * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list - */ -void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) -{ - xfs_mount_t *mp; - xfs_perag_busy_t *bsy; - int n; - SPLDECL(s); - - mp = tp->t_mountp; - s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); - - /* search pagb_list for an open slot */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } - } - - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &mp->m_perag[agno].pagb_list[n]; - mp->m_perag[agno].pagb_count++; - TRACE_BUSY("xfs_alloc_mark_busy", "got", agno, bno, len, n, tp); - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { - TRACE_BUSY("xfs_alloc_mark_busy", "FULL", agno, bno, len, -1, tp); - /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. - */ - xfs_trans_set_sync(tp); - } - - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); -} - -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) -{ - xfs_mount_t *mp; - xfs_perag_busy_t *list; - SPLDECL(s); - - mp = tp->t_mountp; - - s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); - list = mp->m_perag[agno].pagb_list; - - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - if (list[idx].busy_tp == tp) { - TRACE_UNBUSY("xfs_alloc_clear_busy", "found", agno, idx, tp); - list[idx].busy_tp = NULL; - mp->m_perag[agno].pagb_count--; - } else { - TRACE_UNBUSY("xfs_alloc_clear_busy", "missing", agno, idx, tp); - } - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); -} - - -/* - * returns non-zero if any of (agno,bno):len is in a busy list - */ -STATIC int -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) -{ - xfs_mount_t *mp; - xfs_perag_busy_t *bsy; - int n; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn; - int cnt; - SPLDECL(s); - - mp = tp->t_mountp; - - s = mutex_spinlock(&mp->m_perag[agno].pagb_lock); - cnt = mp->m_perag[agno].pagb_count; + /* + * validate that the block number is legal - the enables us to detect + * and handle a silent filesystem corruption rather than crashing. + */ + args.agno = XFS_FSB_TO_AGNO(args.mp, bno); + if (args.agno >= args.mp->m_sb.sb_agcount) + return EFSCORRUPTED; - uend = bno + len - 1; + args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); + if (args.agbno >= args.mp->m_sb.sb_agblocks) + return EFSCORRUPTED; - /* search pagb_list for this slot, skipping open slots */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; - cnt; bsy++, n++) { + args.pag = xfs_perag_get(args.mp, args.agno); + ASSERT(args.pag); - /* - * (start1,length1) within (start2, length2) - */ - if (bsy->busy_tp != NULL) { - bend = bsy->busy_start + bsy->busy_length - 1; - if ((bno > bend) || - (uend < bsy->busy_start)) { - cnt--; - } else { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", - "found1", agno, bno, len, n, - tp); - break; - } - } - } + error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + if (error) + goto error0; - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (cnt) { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); - lsn = bsy->busy_tp->t_commit_lsn; - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); - xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); - } else { - TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); - n = -1; - mutex_spinunlock(&mp->m_perag[agno].pagb_lock, s); + /* validate the extent size is legal now we have the agf locked */ + if (args.agbno + len > + be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { + error = EFSCORRUPTED; + goto error0; } - return n; + error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); +error0: + xfs_perag_put(args.pag); + return error; } diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 5a4256120cc..feacb061bab 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -19,23 +19,35 @@ #define __XFS_ALLOC_H__ struct xfs_buf; +struct xfs_btree_cur; struct xfs_mount; struct xfs_perag; struct xfs_trans; +extern struct workqueue_struct *xfs_alloc_wq; + /* * Freespace allocation types. Argument to xfs_alloc_[v]extent. */ -typedef enum xfs_alloctype -{ - XFS_ALLOCTYPE_ANY_AG, /* allocate anywhere, use rotor */ - XFS_ALLOCTYPE_FIRST_AG, /* ... start at ag 0 */ - XFS_ALLOCTYPE_START_AG, /* anywhere, start in this a.g. */ - XFS_ALLOCTYPE_THIS_AG, /* anywhere in this a.g. */ - XFS_ALLOCTYPE_START_BNO, /* near this block else anywhere */ - XFS_ALLOCTYPE_NEAR_BNO, /* in this a.g. and near this block */ - XFS_ALLOCTYPE_THIS_BNO /* at exactly this block */ -} xfs_alloctype_t; +#define XFS_ALLOCTYPE_ANY_AG 0x01 /* allocate anywhere, use rotor */ +#define XFS_ALLOCTYPE_FIRST_AG 0x02 /* ... start at ag 0 */ +#define XFS_ALLOCTYPE_START_AG 0x04 /* anywhere, start in this a.g. */ +#define XFS_ALLOCTYPE_THIS_AG 0x08 /* anywhere in this a.g. */ +#define XFS_ALLOCTYPE_START_BNO 0x10 /* near this block else anywhere */ +#define XFS_ALLOCTYPE_NEAR_BNO 0x20 /* in this a.g. and near this block */ +#define XFS_ALLOCTYPE_THIS_BNO 0x40 /* at exactly this block */ + +/* this should become an enum again when the tracing code is fixed */ +typedef unsigned int xfs_alloctype_t; + +#define XFS_ALLOC_TYPES \ + { XFS_ALLOCTYPE_ANY_AG, "ANY_AG" }, \ + { XFS_ALLOCTYPE_FIRST_AG, "FIRST_AG" }, \ + { XFS_ALLOCTYPE_START_AG, "START_AG" }, \ + { XFS_ALLOCTYPE_THIS_AG, "THIS_AG" }, \ + { XFS_ALLOCTYPE_START_BNO, "START_BNO" }, \ + { XFS_ALLOCTYPE_NEAR_BNO, "NEAR_BNO" }, \ + { XFS_ALLOCTYPE_THIS_BNO, "THIS_BNO" } /* * Flags for xfs_alloc_fix_freelist. @@ -64,6 +76,22 @@ typedef enum xfs_alloctype #define XFS_ALLOC_SET_ASIDE(mp) (4 + ((mp)->m_sb.sb_agcount * 4)) /* + * When deciding how much space to allocate out of an AG, we limit the + * allocation maximum size to the size the AG. However, we cannot use all the + * blocks in the AG - some are permanently used by metadata. These + * blocks are generally: + * - the AG superblock, AGF, AGI and AGFL + * - the AGF (bno and cnt) and AGI btree root blocks + * - 4 blocks on the AGFL according to XFS_ALLOC_SET_ASIDE() limits + * + * The AG headers are sector sized, so the amount of space they take up is + * dependent on filesystem geometry. The others are all single blocks. + */ +#define XFS_ALLOC_AG_MAX_USABLE(mp) \ + ((mp)->m_sb.sb_agblocks - XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)) - 7) + + +/* * Argument structure for xfs_alloc routines. * This is turned into a structure to avoid having 20 arguments passed * down several levels of the stack. @@ -100,26 +128,12 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ - -#ifdef __KERNEL__ - -#if defined(XFS_ALLOC_TRACE) /* - * Allocation tracing buffer size. + * Find the length of the longest extent in an AG. */ -#define XFS_ALLOC_TRACE_SIZE 4096 -extern ktrace_t *xfs_alloc_trace_buf; - -/* - * Types for alloc tracing. - */ -#define XFS_ALLOC_KTRACE_ALLOC 1 -#define XFS_ALLOC_KTRACE_FREE 2 -#define XFS_ALLOC_KTRACE_MODAGF 3 -#define XFS_ALLOC_KTRACE_BUSY 4 -#define XFS_ALLOC_KTRACE_UNBUSY 5 -#define XFS_ALLOC_KTRACE_BUSYSEARCH 6 -#endif +xfs_extlen_t +xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag); /* * Compute and fill in value of m_ag_maxlevels. @@ -136,7 +150,8 @@ int /* error */ xfs_alloc_get_freelist( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer containing the agf structure */ - xfs_agblock_t *bnop); /* block address retrieved from freelist */ + xfs_agblock_t *bnop, /* block address retrieved from freelist */ + int btreeblk); /* destination is a AGF btree */ /* * Log the given fields from the agf structure. @@ -165,7 +180,8 @@ xfs_alloc_put_freelist( struct xfs_trans *tp, /* transaction pointer */ struct xfs_buf *agbp, /* buffer for a.g. freelist header */ struct xfs_buf *agflbp,/* buffer for a.g. free block array */ - xfs_agblock_t bno); /* block being freed */ + xfs_agblock_t bno, /* block being freed */ + int btreeblk); /* owner was a AGF btree */ /* * Read in the allocation group header (free/alloc section). @@ -194,18 +210,25 @@ xfs_free_extent( xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len); /* length of extent */ -void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +int /* error */ +xfs_alloc_lookup_le( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - int idx); - - -#endif /* __KERNEL__ */ +int /* error */ +xfs_alloc_lookup_ge( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* starting block of extent */ + xfs_extlen_t len, /* length of extent */ + int *stat); /* success/failure */ + +int /* error */ +xfs_alloc_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t *bno, /* output: starting block of extent */ + xfs_extlen_t *len, /* output: length of extent */ + int *stat); /* output: success/failure */ #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 7446556e802..8358f1ded94 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -17,2185 +17,488 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" +#include "xfs_extent_busy.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" -/* - * Prototypes for internal functions. - */ - -STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int); -STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *); -STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *, - xfs_alloc_key_t *, xfs_btree_cur_t **, int *); -STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int); -/* - * Internal functions. - */ +STATIC struct xfs_btree_cur * +xfs_allocbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} -/* - * Single level of the xfs_alloc_delete record deletion routine. - * Delete record pointed to by cur/level. - * Remove the record from its block then rebalance the tree. - * Return 0 for error, 1 for done, 2 to go on to the next level. - */ -STATIC int /* error */ -xfs_alloc_delrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level removing record from */ - int *stat) /* fail/done/go-on */ +STATIC void +xfs_allocbt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int inc) { - xfs_agf_t *agf; /* allocation group freelist header */ - xfs_alloc_block_t *block; /* btree block record/key lives in */ - xfs_agblock_t bno; /* btree block number */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_alloc_key_t key; /* kp points here if block is level 0 */ - xfs_agblock_t lbno; /* left block's block number */ - xfs_buf_t *lbp; /* left block's buffer pointer */ - xfs_alloc_block_t *left; /* left btree block */ - xfs_alloc_key_t *lkp=NULL; /* left block key pointer */ - xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */ - int lrecs=0; /* number of records in left block */ - xfs_alloc_rec_t *lrp; /* left block record pointer */ - xfs_mount_t *mp; /* mount structure */ - int ptr; /* index in btree block for this rec */ - xfs_agblock_t rbno; /* right block's block number */ - xfs_buf_t *rbp; /* right block's buffer pointer */ - xfs_alloc_block_t *right; /* right btree block */ - xfs_alloc_key_t *rkp; /* right block key pointer */ - xfs_alloc_ptr_t *rpp; /* right block address pointer */ - int rrecs=0; /* number of records in right block */ - xfs_alloc_rec_t *rrp; /* right block record pointer */ - xfs_btree_cur_t *tcur; /* temporary btree cursor */ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + int btnum = cur->bc_btnum; + struct xfs_perag *pag = xfs_perag_get(cur->bc_mp, seqno); - /* - * Get the index of the entry being deleted, check for nothing there. - */ - ptr = cur->bc_ptrs[level]; - if (ptr == 0) { - *stat = 0; - return 0; - } - /* - * Get the buffer & block containing the record or key/ptr. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Fail if we're off the end of the block. - */ - if (ptr > be16_to_cpu(block->bb_numrecs)) { - *stat = 0; - return 0; - } - XFS_STATS_INC(xs_abt_delrec); - /* - * It's a nonleaf. Excise the key and ptr being deleted, by - * sliding the entries past them down one. - * Log the changed areas of the block. - */ - if (level > 0) { - lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur); - lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = ptr; i < be16_to_cpu(block->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) - return error; - } -#endif - if (ptr < be16_to_cpu(block->bb_numrecs)) { - memmove(&lkp[ptr - 1], &lkp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lkp)); - memmove(&lpp[ptr - 1], &lpp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lpp)); - xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); - xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); - } - } - /* - * It's a leaf. Excise the record being deleted, by sliding the - * entries past it down one. Log the changed areas of the block. - */ - else { - lrp = XFS_ALLOC_REC_ADDR(block, 1, cur); - if (ptr < be16_to_cpu(block->bb_numrecs)) { - memmove(&lrp[ptr - 1], &lrp[ptr], - (be16_to_cpu(block->bb_numrecs) - ptr) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs) - 1); - } - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - if (ptr == 1) { - key.ar_startblock = lrp->ar_startblock; - key.ar_blockcount = lrp->ar_blockcount; - lkp = &key; - } - } - /* - * Decrement and log the number of entries in the block. - */ - be16_add(&block->bb_numrecs, -1); - xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); - /* - * See if the longest free extent in the allocation group was - * changed by this operation. True if it's the by-size btree, and - * this is the leaf level, and there is no right sibling block, - * and this was the last record. - */ - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - mp = cur->bc_mp; - - if (level == 0 && - cur->bc_btnum == XFS_BTNUM_CNT && - be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - ptr > be16_to_cpu(block->bb_numrecs)) { - ASSERT(ptr == be16_to_cpu(block->bb_numrecs) + 1); - /* - * There are still records in the block. Grab the size - * from the last one. - */ - if (be16_to_cpu(block->bb_numrecs)) { - rrp = XFS_ALLOC_REC_ADDR(block, be16_to_cpu(block->bb_numrecs), cur); - agf->agf_longest = rrp->ar_blockcount; - } - /* - * No free extents left. - */ - else - agf->agf_longest = 0; - mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest = - be32_to_cpu(agf->agf_longest); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_LONGEST); - } - /* - * Is this the root level? If so, we're almost done. - */ - if (level == cur->bc_nlevels - 1) { - /* - * If this is the root level, - * and there's only one entry left, - * and it's NOT the leaf level, - * then we can get rid of this level. - */ - if (be16_to_cpu(block->bb_numrecs) == 1 && level > 0) { - /* - * lpp is still set to the first pointer in the block. - * Make it the new root of the btree. - */ - bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]); - agf->agf_roots[cur->bc_btnum] = *lpp; - be32_add(&agf->agf_levels[cur->bc_btnum], -1); - mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--; - /* - * Put this buffer/block on the ag's freelist. - */ - if ((error = xfs_alloc_put_freelist(cur->bc_tp, - cur->bc_private.a.agbp, NULL, bno))) - return error; - /* - * Since blocks move to the free list without the - * coordination used in xfs_bmap_finish, we can't allow - * block to be available for reallocation and - * non-transaction writing (user data) until we know - * that the transaction that moved it to the free list - * is permanently on disk. We track the blocks by - * declaring these blocks as "busy"; the busy list is - * maintained on a per-ag basis and each transaction - * records which entries should be removed when the - * iclog commits to disk. If a busy block is - * allocated, the iclog is pushed up to the LSN - * that freed the block. - */ - xfs_alloc_mark_busy(cur->bc_tp, - be32_to_cpu(agf->agf_seqno), bno, 1); - - xfs_trans_agbtree_delta(cur->bc_tp, -1); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_ROOTS | XFS_AGF_LEVELS); - /* - * Update the cursor so there's one fewer level. - */ - xfs_btree_setbuf(cur, level, NULL); - cur->bc_nlevels--; - } else if (level > 0 && - (error = xfs_alloc_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * If we deleted the leftmost entry in the block, update the - * key values above us in the tree. - */ - if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1))) - return error; - /* - * If the number of records remaining in the block is at least - * the minimum, we're done. - */ - if (be16_to_cpu(block->bb_numrecs) >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) { - if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * Otherwise, we have to move some records around to keep the - * tree balanced. Look at the left and right sibling blocks to - * see if we can re-balance by moving only one record. - */ - rbno = be32_to_cpu(block->bb_rightsib); - lbno = be32_to_cpu(block->bb_leftsib); - bno = NULLAGBLOCK; - ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); - /* - * Duplicate the cursor so our btree manipulations here won't - * disrupt the next level up. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - /* - * If there's a right sibling, see if it's ok to shift an entry - * out of it. - */ - if (rbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the last entry in the next block. - * Actually any entry but the first would suffice. - */ - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_increment(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - /* - * Grab a pointer to the block. - */ - rbp = tcur->bc_bufs[level]; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(right->bb_leftsib); - /* - * If right block is full enough so that removing one entry - * won't make it too empty, and left-shifting an entry out - * of right to us works, we're done. - */ - if (be16_to_cpu(right->bb_numrecs) - 1 >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_alloc_lshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level > 0 && - (error = xfs_alloc_decrement(cur, level, - &i))) - return error; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference, and fix up the temp cursor to point - * to our block again (last record). - */ - rrecs = be16_to_cpu(right->bb_numrecs); - if (lbno != NULLAGBLOCK) { - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_decrement(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } - } - /* - * If there's a left sibling, see if it's ok to shift an entry - * out of it. - */ - if (lbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the first entry in the - * previous block. - */ - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_decrement(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_btree_firstrec(tcur, level); - /* - * Grab a pointer to the block. - */ - lbp = tcur->bc_bufs[level]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(left->bb_rightsib); - /* - * If left block is full enough so that removing one entry - * won't make it too empty, and right-shifting an entry out - * of left to us works, we're done. - */ - if (be16_to_cpu(left->bb_numrecs) - 1 >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_alloc_rshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_ALLOC_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level == 0) - cur->bc_ptrs[0]++; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference. - */ - lrecs = be16_to_cpu(left->bb_numrecs); - } - /* - * Delete the temp cursor, we're done with it. - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - /* - * If here, we need to do a join to keep the tree balanced. - */ - ASSERT(bno != NULLAGBLOCK); - /* - * See if we can join with the left neighbor block. - */ - if (lbno != NULLAGBLOCK && - lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - /* - * Set "right" to be the starting block, - * "left" to be the left neighbor. - */ - rbno = bno; - right = block; - rbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, lbno, 0, &lbp, - XFS_ALLOC_BTREE_REF))) - return error; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - } - /* - * If that won't work, see if we can join with the right neighbor block. - */ - else if (rbno != NULLAGBLOCK && - rrecs + be16_to_cpu(block->bb_numrecs) <= - XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - /* - * Set "left" to be the starting block, - * "right" to be the right neighbor. - */ - lbno = bno; - left = block; - lbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, rbno, 0, &rbp, - XFS_ALLOC_BTREE_REF))) - return error; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - } - /* - * Otherwise, we can't fix the imbalance. - * Just return. This is probably a logic error, but it's not fatal. - */ - else { - if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * We're now going to join "left" and "right" by moving all the stuff - * in "right" to "left" and deleting "right". - */ - if (level > 0) { - /* - * It's a non-leaf. Move keys and pointers. - */ - lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); - lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memcpy(lkp, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*lkp)); - memcpy(lpp, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*lpp)); - xfs_alloc_log_keys(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); - } else { - /* - * It's a leaf. Move records. - */ - lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs) + 1, cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*lrp)); - xfs_alloc_log_recs(cur, lbp, be16_to_cpu(left->bb_numrecs) + 1, - be16_to_cpu(left->bb_numrecs) + - be16_to_cpu(right->bb_numrecs)); - } - /* - * If we joined with the left neighbor, set the buffer in the - * cursor to the left block, and fix up the index. - */ - if (bp != lbp) { - xfs_btree_setbuf(cur, level, lbp); - cur->bc_ptrs[level] += be16_to_cpu(left->bb_numrecs); - } - /* - * If we joined with the right neighbor and there's a level above - * us, increment the cursor at that level. - */ - else if (level + 1 < cur->bc_nlevels && - (error = xfs_alloc_increment(cur, level + 1, &i))) - return error; - /* - * Fix up the number of records in the surviving block. - */ - be16_add(&left->bb_numrecs, be16_to_cpu(right->bb_numrecs)); - /* - * Fix up the right block pointer in the surviving block, and log it. - */ - left->bb_rightsib = right->bb_rightsib; - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there is a right sibling now, make it point to the - * remaining block. - */ - if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) { - xfs_alloc_block_t *rrblock; - xfs_buf_t *rrbp; - - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0, - &rrbp, XFS_ALLOC_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(lbno); - xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * Free the deleting block by putting it on the freelist. - */ - if ((error = xfs_alloc_put_freelist(cur->bc_tp, cur->bc_private.a.agbp, - NULL, rbno))) - return error; - /* - * Since blocks move to the free list without the coordination - * used in xfs_bmap_finish, we can't allow block to be available - * for reallocation and non-transaction writing (user data) - * until we know that the transaction that moved it to the free - * list is permanently on disk. We track the blocks by declaring - * these blocks as "busy"; the busy list is maintained on a - * per-ag basis and each transaction records which entries - * should be removed when the iclog commits to disk. If a - * busy block is allocated, the iclog is pushed up to the - * LSN that freed the block. - */ - xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); - xfs_trans_agbtree_delta(cur->bc_tp, -1); + ASSERT(ptr->s != 0); - /* - * Adjust the current level's cursor so that we're left referring - * to the right node, after we're done. - * If this leaves the ptr value 0 our caller will fix it up. - */ - if (level > 0) - cur->bc_ptrs[level]--; - /* - * Return value means the next level up has something to do. - */ - *stat = 2; - return 0; + agf->agf_roots[btnum] = ptr->s; + be32_add_cpu(&agf->agf_levels[btnum], inc); + pag->pagf_levels[btnum] += inc; + xfs_perag_put(pag); -error0: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; + xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS); } -/* - * Insert one record/level. Return information to the caller - * allowing the next level up to proceed if necessary. - */ -STATIC int /* error */ -xfs_alloc_insrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to insert record at */ - xfs_agblock_t *bnop, /* i/o: block number inserted */ - xfs_alloc_rec_t *recp, /* i/o: record data inserted */ - xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ - int *stat) /* output: success/failure */ +STATIC int +xfs_allocbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { - xfs_agf_t *agf; /* allocation group freelist header */ - xfs_alloc_block_t *block; /* btree block record/key lives in */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_alloc_key_t key; /* key value being inserted */ - xfs_alloc_key_t *kp; /* pointer to btree keys */ - xfs_agblock_t nbno; /* block number of allocated block */ - xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ - xfs_alloc_key_t nkey; /* new key value, from split */ - xfs_alloc_rec_t nrec; /* new record value, for caller */ - int optr; /* old ptr value */ - xfs_alloc_ptr_t *pp; /* pointer to btree addresses */ - int ptr; /* index in btree block for this rec */ - xfs_alloc_rec_t *rp; /* pointer to btree records */ - - ASSERT(be32_to_cpu(recp->ar_blockcount) > 0); + int error; + xfs_agblock_t bno; - /* - * GCC doesn't understand the (arguably complex) control flow in - * this function and complains about uninitialized structure fields - * without this. - */ - memset(&nrec, 0, sizeof(nrec)); + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* - * If we made it to the root level, allocate a new root block - * and we're done. - */ - if (level >= cur->bc_nlevels) { - XFS_STATS_INC(xs_abt_insrec); - if ((error = xfs_alloc_newroot(cur, &i))) - return error; - *bnop = NULLAGBLOCK; - *stat = i; - return 0; + /* Allocate the new block from the freelist. If we can't, give up. */ + error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, + &bno, 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; } - /* - * Make a key out of the record data to be inserted, and save it. - */ - key.ar_startblock = recp->ar_startblock; - key.ar_blockcount = recp->ar_blockcount; - optr = ptr = cur->bc_ptrs[level]; - /* - * If we're off the left edge, return failure. - */ - if (ptr == 0) { + + if (bno == NULLAGBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } - XFS_STATS_INC(xs_abt_insrec); - /* - * Get pointers to the btree buffer and block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; - /* - * Check that the new entry is being inserted in the right place. - */ - if (ptr <= be16_to_cpu(block->bb_numrecs)) { - if (level == 0) { - rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); - xfs_btree_check_rec(cur->bc_btnum, recp, rp); - } else { - kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); - xfs_btree_check_key(cur->bc_btnum, &key, kp); - } - } -#endif - nbno = NULLAGBLOCK; - ncur = (xfs_btree_cur_t *)0; - /* - * If the block is full, we can't insert the new entry until we - * make the block un-full. - */ - if (be16_to_cpu(block->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - /* - * First, try shifting an entry to the right neighbor. - */ - if ((error = xfs_alloc_rshift(cur, level, &i))) - return error; - if (i) { - /* nothing */ - } - /* - * Next, try shifting an entry to the left neighbor. - */ - else { - if ((error = xfs_alloc_lshift(cur, level, &i))) - return error; - if (i) - optr = ptr = cur->bc_ptrs[level]; - else { - /* - * Next, try splitting the current block in - * half. If this works we have to re-set our - * variables because we could be in a - * different block now. - */ - if ((error = xfs_alloc_split(cur, level, &nbno, - &nkey, &ncur, &i))) - return error; - if (i) { - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = - xfs_btree_check_sblock(cur, - block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - nrec.ar_startblock = nkey.ar_startblock; - nrec.ar_blockcount = nkey.ar_blockcount; - } - /* - * Otherwise the insert fails. - */ - else { - *stat = 0; - return 0; - } - } - } - } - /* - * At this point we know there's room for our new entry in the block - * we're pointing at. - */ - if (level > 0) { - /* - * It's a non-leaf entry. Make a hole for the new data - * in the key and ptr regions of the block. - */ - kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); - pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = be16_to_cpu(block->bb_numrecs); i >= ptr; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) - return error; - } -#endif - memmove(&kp[ptr], &kp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*pp)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, *bnop, level))) - return error; -#endif - /* - * Now stuff the new data in, bump numrecs and log the new data. - */ - kp[ptr - 1] = key; - pp[ptr - 1] = cpu_to_be32(*bnop); - be16_add(&block->bb_numrecs, 1); - xfs_alloc_log_keys(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); - xfs_alloc_log_ptrs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); -#ifdef DEBUG - if (ptr < be16_to_cpu(block->bb_numrecs)) - xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, - kp + ptr); -#endif - } else { - /* - * It's a leaf entry. Make a hole for the new record. - */ - rp = XFS_ALLOC_REC_ADDR(block, 1, cur); - memmove(&rp[ptr], &rp[ptr - 1], - (be16_to_cpu(block->bb_numrecs) - ptr + 1) * sizeof(*rp)); - /* - * Now stuff the new record in, bump numrecs - * and log the new data. - */ - rp[ptr - 1] = *recp; /* INT_: struct copy */ - be16_add(&block->bb_numrecs, 1); - xfs_alloc_log_recs(cur, bp, ptr, be16_to_cpu(block->bb_numrecs)); -#ifdef DEBUG - if (ptr < be16_to_cpu(block->bb_numrecs)) - xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, - rp + ptr); -#endif - } - /* - * Log the new number of records in the btree header. - */ - xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); - /* - * If we inserted at the start of a block, update the parents' keys. - */ - if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1))) - return error; - /* - * Look to see if the longest extent in the allocation group - * needs to be updated. - */ - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - if (level == 0 && - cur->bc_btnum == XFS_BTNUM_CNT && - be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) { - /* - * If this is a leaf in the by-size btree and there - * is no right sibling block and this block is bigger - * than the previous longest block, update it. - */ - agf->agf_longest = recp->ar_blockcount; - cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest - = be32_to_cpu(recp->ar_blockcount); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_LONGEST); - } - /* - * Return the new block number, if any. - * If there is one, give back a record value and a cursor too. - */ - *bnop = nbno; - if (nbno != NULLAGBLOCK) { - *recp = nrec; /* INT_: struct copy */ - *curp = ncur; /* INT_: struct copy */ - } + xfs_extent_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + + xfs_trans_agbtree_delta(cur->bc_tp, 1); + new->s = cpu_to_be32(bno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; } -/* - * Log header fields from a btree block. - */ -STATIC void -xfs_alloc_log_block( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ +STATIC int +xfs_allocbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) { - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - static const short offsets[] = { /* table of offsets */ - offsetof(xfs_alloc_block_t, bb_magic), - offsetof(xfs_alloc_block_t, bb_level), - offsetof(xfs_alloc_block_t, bb_numrecs), - offsetof(xfs_alloc_block_t, bb_leftsib), - offsetof(xfs_alloc_block_t, bb_rightsib), - sizeof(xfs_alloc_block_t) - }; - - xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); - xfs_trans_log_buf(tp, bp, first, last); + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + xfs_agblock_t bno; + int error; + + bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); + if (error) + return error; + + xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + xfs_trans_agbtree_delta(cur->bc_tp, -1); + + xfs_trans_binval(cur->bc_tp, bp); + return 0; } /* - * Log keys from a btree block (nonleaf). + * Update the longest extent in the AGF */ STATIC void -xfs_alloc_log_keys( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int kfirst, /* index of first key to log */ - int klast) /* index of last key to log */ +xfs_allocbt_update_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, + int reason) { - xfs_alloc_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - xfs_alloc_key_t *kp; /* key pointer in btree block */ - int last; /* last byte offset logged */ - - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - kp = XFS_ALLOC_KEY_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); + xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno); + struct xfs_perag *pag; + __be32 len; + int numrecs; + + ASSERT(cur->bc_btnum == XFS_BTNUM_CNT); + + switch (reason) { + case LASTREC_UPDATE: + /* + * If this is the last leaf block and it's the last record, + * then update the size of the longest extent in the AG. + */ + if (ptr != xfs_btree_get_numrecs(block)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_INSREC: + if (be32_to_cpu(rec->alloc.ar_blockcount) <= + be32_to_cpu(agf->agf_longest)) + return; + len = rec->alloc.ar_blockcount; + break; + case LASTREC_DELREC: + numrecs = xfs_btree_get_numrecs(block); + if (ptr <= numrecs) + return; + ASSERT(ptr == numrecs + 1); + + if (numrecs) { + xfs_alloc_rec_t *rrp; + + rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs); + len = rrp->ar_blockcount; + } else { + len = 0; + } + + break; + default: + ASSERT(0); + return; + } + + agf->agf_longest = len; + pag = xfs_perag_get(cur->bc_mp, seqno); + pag->pagf_longest = be32_to_cpu(len); + xfs_perag_put(pag); + xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST); } -/* - * Log block pointer fields from a btree block (nonleaf). - */ -STATIC void -xfs_alloc_log_ptrs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int pfirst, /* index of first pointer to log */ - int plast) /* index of last pointer to log */ +STATIC int +xfs_allocbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) { - xfs_alloc_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */ - - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - pp = XFS_ALLOC_PTR_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + return cur->bc_mp->m_alloc_mnr[level != 0]; } -/* - * Log records from a btree block (leaf). - */ -STATIC void -xfs_alloc_log_recs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int rfirst, /* index of first record to log */ - int rlast) /* index of last record to log */ +STATIC int +xfs_allocbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) { - xfs_alloc_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_alloc_rec_t *rp; /* record pointer for btree block */ - - - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - rp = XFS_ALLOC_REC_ADDR(block, 1, cur); -#ifdef DEBUG - { - xfs_agf_t *agf; - xfs_alloc_rec_t *p; - - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++) - ASSERT(be32_to_cpu(p->ar_startblock) + - be32_to_cpu(p->ar_blockcount) <= - be32_to_cpu(agf->agf_length)); - } -#endif - first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + return cur->bc_mp->m_alloc_mxr[level != 0]; } -/* - * Lookup the record. The cursor is made to point to it, based on dir. - * Return 0 if can't find any such record, 1 for success. - */ -STATIC int /* error */ -xfs_alloc_lookup( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_lookup_t dir, /* <=, ==, or >= */ - int *stat) /* success/failure */ +STATIC void +xfs_allocbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_agblock_t agbno; /* a.g. relative btree block number */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_alloc_block_t *block=NULL; /* current btree block */ - int diff; /* difference for the current key */ - int error; /* error return value */ - int keyno=0; /* current key number */ - int level; /* level in the btree */ - xfs_mount_t *mp; /* file system mount point */ - - XFS_STATS_INC(xs_abt_lookup); - /* - * Get the allocation group header, and the root block number. - */ - mp = cur->bc_mp; + ASSERT(rec->alloc.ar_startblock != 0); - { - xfs_agf_t *agf; /* a.g. freespace header */ + key->alloc.ar_startblock = rec->alloc.ar_startblock; + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; +} - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - agno = be32_to_cpu(agf->agf_seqno); - agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]); - } - /* - * Iterate over each level in the btree, starting at the root. - * For each level above the leaves, find the key we need, based - * on the lookup record, then follow the corresponding block - * pointer down to the next level. - */ - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { - xfs_buf_t *bp; /* buffer pointer for btree block */ - xfs_daddr_t d; /* disk address of btree block */ +STATIC void +xfs_allocbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->alloc.ar_startblock != 0); - /* - * Get the disk address we're looking for. - */ - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - /* - * If the old buffer at this level is for a different block, - * throw it away, otherwise just use it. - */ - bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; - if (!bp) { - /* - * Need to get a new buffer. Read it, then - * set it in the cursor, releasing the old one. - */ - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno, - agbno, 0, &bp, XFS_ALLOC_BTREE_REF))) - return error; - xfs_btree_setbuf(cur, level, bp); - /* - * Point to the btree block, now that we have the buffer - */ - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, level, - bp))) - return error; - } else - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - /* - * If we already had a key match at a higher level, we know - * we need to use the first entry in this block. - */ - if (diff == 0) - keyno = 1; - /* - * Otherwise we need to search this block. Do a binary search. - */ - else { - int high; /* high entry number */ - xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */ - xfs_alloc_rec_t *krbase=NULL;/* base of records in block */ - int low; /* low entry number */ - - /* - * Get a pointer to keys or records. - */ - if (level > 0) - kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur); - else - krbase = XFS_ALLOC_REC_ADDR(block, 1, cur); - /* - * Set low and high entry numbers, 1-based. - */ - low = 1; - if (!(high = be16_to_cpu(block->bb_numrecs))) { - /* - * If the block is empty, the tree must - * be an empty leaf. - */ - ASSERT(level == 0 && cur->bc_nlevels == 1); - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - *stat = 0; - return 0; - } - /* - * Binary search the block. - */ - while (low <= high) { - xfs_extlen_t blockcount; /* key value */ - xfs_agblock_t startblock; /* key value */ - - XFS_STATS_INC(xs_abt_compare); - /* - * keyno is average of low and high. - */ - keyno = (low + high) >> 1; - /* - * Get startblock & blockcount. - */ - if (level > 0) { - xfs_alloc_key_t *kkp; - - kkp = kkbase + keyno - 1; - startblock = be32_to_cpu(kkp->ar_startblock); - blockcount = be32_to_cpu(kkp->ar_blockcount); - } else { - xfs_alloc_rec_t *krp; - - krp = krbase + keyno - 1; - startblock = be32_to_cpu(krp->ar_startblock); - blockcount = be32_to_cpu(krp->ar_blockcount); - } - /* - * Compute difference to get next direction. - */ - if (cur->bc_btnum == XFS_BTNUM_BNO) - diff = (int)startblock - - (int)cur->bc_rec.a.ar_startblock; - else if (!(diff = (int)blockcount - - (int)cur->bc_rec.a.ar_blockcount)) - diff = (int)startblock - - (int)cur->bc_rec.a.ar_startblock; - /* - * Less than, move right. - */ - if (diff < 0) - low = keyno + 1; - /* - * Greater than, move left. - */ - else if (diff > 0) - high = keyno - 1; - /* - * Equal, we're done. - */ - else - break; - } - } - /* - * If there are more levels, set up for the next level - * by getting the block number and filling in the cursor. - */ - if (level > 0) { - /* - * If we moved left, need the previous key number, - * unless there isn't one. - */ - if (diff > 0 && --keyno < 1) - keyno = 1; - agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, agbno, level))) - return error; -#endif - cur->bc_ptrs[level] = keyno; - } - } - /* - * Done with the search. - * See if we need to adjust the results. - */ - if (dir != XFS_LOOKUP_LE && diff < 0) { - keyno++; - /* - * If ge search and we went off the end of the block, but it's - * not the last block, we're in the wrong block. - */ - if (dir == XFS_LOOKUP_GE && - keyno > be16_to_cpu(block->bb_numrecs) && - be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) { - int i; - - cur->bc_ptrs[0] = keyno; - if ((error = xfs_alloc_increment(cur, 0, &i))) - return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); - *stat = 1; - return 0; - } - } - else if (dir == XFS_LOOKUP_LE && diff > 0) - keyno--; - cur->bc_ptrs[0] = keyno; - /* - * Return if we succeeded or not. - */ - if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) - *stat = 0; - else - *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); - return 0; + rec->alloc.ar_startblock = key->alloc.ar_startblock; + rec->alloc.ar_blockcount = key->alloc.ar_blockcount; } -/* - * Move 1 record left from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_alloc_lshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ +STATIC void +xfs_allocbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) { - int error; /* error return value */ -#ifdef DEBUG - int i; /* loop index */ -#endif - xfs_alloc_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left neighbor block */ - xfs_alloc_block_t *left; /* left neighbor btree block */ - int nrec; /* new number of left block entries */ - xfs_buf_t *rbp; /* buffer for right (current) block */ - xfs_alloc_block_t *right; /* right (current) btree block */ - xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */ - xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */ - xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */ + ASSERT(cur->bc_rec.a.ar_startblock != 0); - /* - * Set up variables for this block as "right". - */ - rbp = cur->bc_bufs[level]; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; -#endif - /* - * If we've got no left sibling then we can't shift an entry left. - */ - if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] <= 1) { - *stat = 0; - return 0; - } - /* - * Set up the left neighbor as "left". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib), - 0, &lbp, XFS_ALLOC_BTREE_REF))) - return error; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - nrec = be16_to_cpu(left->bb_numrecs) + 1; - /* - * If non-leaf, copy a key and a ptr to the left block. - */ - if (level > 0) { - xfs_alloc_key_t *lkp; /* key pointer for left block */ - xfs_alloc_ptr_t *lpp; /* address pointer for left block */ - - lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - *lkp = *rkp; - xfs_alloc_log_keys(cur, lbp, nrec, nrec); - lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) - return error; -#endif - *lpp = *rpp; /* INT_: copy */ - xfs_alloc_log_ptrs(cur, lbp, nrec, nrec); - xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); - } - /* - * If leaf, copy a record to the left block. - */ - else { - xfs_alloc_rec_t *lrp; /* record pointer for left block */ - - lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - *lrp = *rrp; - xfs_alloc_log_recs(cur, lbp, nrec, nrec); - xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); - } - /* - * Bump and log left's numrecs, decrement and log right's numrecs. - */ - be16_add(&left->bb_numrecs, 1); - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, -1); - xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Slide the contents of right down one entry. - */ - if (level > 0) { -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]), - level))) - return error; - } -#endif - memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - } else { - memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - key.ar_startblock = rrp->ar_startblock; - key.ar_blockcount = rrp->ar_blockcount; - rkp = &key; - } - /* - * Update the parent key values of right. - */ - if ((error = xfs_alloc_updkey(cur, rkp, level + 1))) - return error; - /* - * Slide the cursor value left one. - */ - cur->bc_ptrs[level]--; - *stat = 1; - return 0; + rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); + rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); } -/* - * Allocate a new root block, fill it in. - */ -STATIC int /* error */ -xfs_alloc_newroot( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +STATIC void +xfs_allocbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) { - int error; /* error return value */ - xfs_agblock_t lbno; /* left block number */ - xfs_buf_t *lbp; /* left btree buffer */ - xfs_alloc_block_t *left; /* left btree block */ - xfs_mount_t *mp; /* mount structure */ - xfs_agblock_t nbno; /* new block number */ - xfs_buf_t *nbp; /* new (root) buffer */ - xfs_alloc_block_t *new; /* new (root) btree block */ - int nptr; /* new value for key index, 1 or 2 */ - xfs_agblock_t rbno; /* right block number */ - xfs_buf_t *rbp; /* right btree buffer */ - xfs_alloc_block_t *right; /* right btree block */ - - mp = cur->bc_mp; - - ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp)); - /* - * Get a buffer from the freelist blocks, for the new root. - */ - if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &nbno))) - return error; - /* - * None available, we fail. - */ - if (nbno == NULLAGBLOCK) { - *stat = 0; - return 0; - } - xfs_trans_agbtree_delta(cur->bc_tp, 1); - nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno, - 0); - new = XFS_BUF_TO_ALLOC_BLOCK(nbp); - /* - * Set the root data in the a.g. freespace structure. - */ - { - xfs_agf_t *agf; /* a.g. freespace header */ - xfs_agnumber_t seqno; - - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno); - be32_add(&agf->agf_levels[cur->bc_btnum], 1); - seqno = be32_to_cpu(agf->agf_seqno); - mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++; - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_ROOTS | XFS_AGF_LEVELS); - } - /* - * At the previous root level there are now two blocks: the old - * root, and the new block generated when it was split. - * We don't know which one the cursor is pointing at, so we - * set up variables "left" and "right" for each case. - */ - lbp = cur->bc_bufs[cur->bc_nlevels - 1]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp))) - return error; -#endif - if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) { - /* - * Our block is left, pick up the right block. - */ - lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp)); - rbno = be32_to_cpu(left->bb_rightsib); - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, rbno, 0, &rbp, - XFS_ALLOC_BTREE_REF))) - return error; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, - cur->bc_nlevels - 1, rbp))) - return error; - nptr = 1; - } else { - /* - * Our block is right, pick up the left block. - */ - rbp = lbp; - right = left; - rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp)); - lbno = be32_to_cpu(right->bb_leftsib); - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.a.agno, lbno, 0, &lbp, - XFS_ALLOC_BTREE_REF))) - return error; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, - cur->bc_nlevels - 1, lbp))) - return error; - nptr = 2; - } - /* - * Fill in the new block's btree header and log it. - */ - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - new->bb_level = cpu_to_be16(cur->bc_nlevels); - new->bb_numrecs = cpu_to_be16(2); - new->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS); - ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); - /* - * Fill in the key data in the new root. - */ - { - xfs_alloc_key_t *kp; /* btree key pointer */ + struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - kp = XFS_ALLOC_KEY_ADDR(new, 1, cur); - if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur); /* INT_: structure copy */ - kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);/* INT_: structure copy */ - } else { - xfs_alloc_rec_t *rp; /* btree record pointer */ - - rp = XFS_ALLOC_REC_ADDR(left, 1, cur); - kp[0].ar_startblock = rp->ar_startblock; - kp[0].ar_blockcount = rp->ar_blockcount; - rp = XFS_ALLOC_REC_ADDR(right, 1, cur); - kp[1].ar_startblock = rp->ar_startblock; - kp[1].ar_blockcount = rp->ar_blockcount; - } - } - xfs_alloc_log_keys(cur, nbp, 1, 2); - /* - * Fill in the pointer data in the new root. - */ - { - xfs_alloc_ptr_t *pp; /* btree address pointer */ + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno)); + ASSERT(agf->agf_roots[cur->bc_btnum] != 0); - pp = XFS_ALLOC_PTR_ADDR(new, 1, cur); - pp[0] = cpu_to_be32(lbno); - pp[1] = cpu_to_be32(rbno); - } - xfs_alloc_log_ptrs(cur, nbp, 1, 2); - /* - * Fix up the cursor. - */ - xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); - cur->bc_ptrs[cur->bc_nlevels] = nptr; - cur->bc_nlevels++; - *stat = 1; - return 0; + ptr->s = agf->agf_roots[cur->bc_btnum]; } -/* - * Move 1 record right from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_alloc_rshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ +STATIC __int64_t +xfs_allocbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) { - int error; /* error return value */ - int i; /* loop index */ - xfs_alloc_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left (current) block */ - xfs_alloc_block_t *left; /* left (current) btree block */ - xfs_buf_t *rbp; /* buffer for right neighbor block */ - xfs_alloc_block_t *right; /* right neighbor btree block */ - xfs_alloc_key_t *rkp; /* key pointer for right block */ - xfs_btree_cur_t *tcur; /* temporary cursor */ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; - /* - * Set up variables for this block as "left". - */ - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * If we've got no right sibling then we can't shift an entry right. - */ - if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) { - *stat = 0; - return 0; + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return (__int64_t)be32_to_cpu(kp->ar_startblock) - + rec->ar_startblock; } - /* - * Set up the right neighbor as "right". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), - 0, &rbp, XFS_ALLOC_BTREE_REF))) - return error; - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - /* - * Make a hole at the start of the right neighbor block, then - * copy the last left block entry to the hole. - */ - if (level > 0) { - xfs_alloc_key_t *lkp; /* key pointer for left block */ - xfs_alloc_ptr_t *lpp; /* address pointer for left block */ - xfs_alloc_ptr_t *rpp; /* address pointer for right block */ - - lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) - return error; -#endif - *rkp = *lkp; /* INT_: copy */ - *rpp = *lpp; /* INT_: copy */ - xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); - } else { - xfs_alloc_rec_t *lrp; /* record pointer for left block */ - xfs_alloc_rec_t *rrp; /* record pointer for right block */ - - lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - *rrp = *lrp; - xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.ar_startblock = rrp->ar_startblock; - key.ar_blockcount = rrp->ar_blockcount; - rkp = &key; - xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); - } - /* - * Decrement and log left's numrecs, bump and log right's numrecs. - */ - be16_add(&left->bb_numrecs, -1); - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, 1); - xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Using a temporary cursor, update the parent key values of the - * block on the right. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_alloc_increment(tcur, level, &i)) || - (error = xfs_alloc_updkey(tcur, rkp, level + 1))) - goto error0; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - *stat = 1; - return 0; -error0: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; -} -/* - * Split cur/level block in half. - * Return new block number and its first record (to be inserted into parent). - */ -STATIC int /* error */ -xfs_alloc_split( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to split */ - xfs_agblock_t *bnop, /* output: block number allocated */ - xfs_alloc_key_t *keyp, /* output: first key of new block */ - xfs_btree_cur_t **curp, /* output: new cursor */ - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; /* loop index/record number */ - xfs_agblock_t lbno; /* left (current) block number */ - xfs_buf_t *lbp; /* buffer for left block */ - xfs_alloc_block_t *left; /* left (current) btree block */ - xfs_agblock_t rbno; /* right (new) block number */ - xfs_buf_t *rbp; /* buffer for right block */ - xfs_alloc_block_t *right; /* right (new) btree block */ + diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; + if (diff) + return diff; - /* - * Allocate the new block from the freelist. - * If we can't do it, we're toast. Give up. - */ - if ((error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp, - &rbno))) - return error; - if (rbno == NULLAGBLOCK) { - *stat = 0; - return 0; - } - xfs_trans_agbtree_delta(cur->bc_tp, 1); - rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno, - rbno, 0); - /* - * Set up the new block as "right". - */ - right = XFS_BUF_TO_ALLOC_BLOCK(rbp); - /* - * "Left" is the current (according to the cursor) block. - */ - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_ALLOC_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * Fill in the btree header for the new block. - */ - right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - right->bb_level = left->bb_level; - right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); - /* - * Make sure that if there's an odd number of entries now, that - * each new block will have the same number of entries. - */ - if ((be16_to_cpu(left->bb_numrecs) & 1) && - cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add(&right->bb_numrecs, 1); - i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; - /* - * For non-leaf blocks, copy keys and addresses over to the new block. - */ - if (level > 0) { - xfs_alloc_key_t *lkp; /* left btree key pointer */ - xfs_alloc_ptr_t *lpp; /* left btree address pointer */ - xfs_alloc_key_t *rkp; /* right btree key pointer */ - xfs_alloc_ptr_t *rpp; /* right btree address pointer */ - - lkp = XFS_ALLOC_KEY_ADDR(left, i, cur); - lpp = XFS_ALLOC_PTR_ADDR(left, i, cur); - rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur); - rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) - return error; - } -#endif - memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - *keyp = *rkp; - } - /* - * For leaf blocks, copy records over to the new block. - */ - else { - xfs_alloc_rec_t *lrp; /* left btree record pointer */ - xfs_alloc_rec_t *rrp; /* right btree record pointer */ - - lrp = XFS_ALLOC_REC_ADDR(left, i, cur); - rrp = XFS_ALLOC_REC_ADDR(right, 1, cur); - memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->ar_startblock = rrp->ar_startblock; - keyp->ar_blockcount = rrp->ar_blockcount; - } - /* - * Find the left block number by looking in the buffer. - * Adjust numrecs, sibling pointers. - */ - lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp)); - be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); - right->bb_rightsib = left->bb_rightsib; - left->bb_rightsib = cpu_to_be32(rbno); - right->bb_leftsib = cpu_to_be32(lbno); - xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS); - xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there's a block to the new block's right, make that block - * point back to right instead of to left. - */ - if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) { - xfs_alloc_block_t *rrblock; /* rr btree block */ - xfs_buf_t *rrbp; /* buffer for rrblock */ - - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0, - &rrbp, XFS_ALLOC_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(rbno); - xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * If the cursor is really in the right block, move it there. - * If it's just pointing past the last entry in left, then we'll - * insert there, so don't change anything in that case. - */ - if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { - xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); - } - /* - * If there are more levels, we'll need another cursor which refers to - * the right block, no matter where this cursor was. - */ - if (level + 1 < cur->bc_nlevels) { - if ((error = xfs_btree_dup_cursor(cur, curp))) - return error; - (*curp)->bc_ptrs[level + 1]++; - } - *bnop = rbno; - *stat = 1; - return 0; + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } -/* - * Update keys at all levels from here to the root along the cursor's path. - */ -STATIC int /* error */ -xfs_alloc_updkey( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_alloc_key_t *keyp, /* new key value to update to */ - int level) /* starting level for update */ +static bool +xfs_allocbt_verify( + struct xfs_buf *bp) { - int ptr; /* index of key in block */ - - /* - * Go up the tree from this level toward the root. - * At each level, update the key value to the value input. - * Stop when we reach a level where the cursor isn't pointing - * at the first entry in the block. - */ - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { - xfs_alloc_block_t *block; /* btree block */ - xfs_buf_t *bp; /* buffer for block */ -#ifdef DEBUG - int error; /* error return value */ -#endif - xfs_alloc_key_t *kp; /* ptr to btree block keys */ - - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur); - *kp = *keyp; - xfs_alloc_log_keys(cur, bp, ptr, ptr); - } - return 0; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * magic number and level verification + * + * During growfs operations, we can't verify the exact level or owner as + * the perag is not fully initialised and hence not attached to the + * buffer. In this case, check against the maximum tree depth. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agf information will not yet have been initialised + * from the on disk AGF. Again, we can only check against maximum limits + * in this case. + */ + level = be16_to_cpu(block->bb_level); + switch (block->bb_magic) { + case cpu_to_be32(XFS_ABTB_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_ABTB_MAGIC): + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_BNOi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + break; + case cpu_to_be32(XFS_ABTC_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_ABTC_MAGIC): + if (pag && pag->pagf_init) { + if (level >= pag->pagf_levels[XFS_BTNUM_CNTi]) + return false; + } else if (level >= mp->m_ag_maxlevels) + return false; + break; + default: + return false; + } + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } -/* - * Externally visible routines. - */ - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_alloc_decrement( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +static void +xfs_allocbt_read_verify( + struct xfs_buf *bp) { - xfs_alloc_block_t *block; /* btree block */ - int error; /* error return value */ - int lev; /* btree level */ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_allocbt_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the left at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); - /* - * Decrement the ptr at this level. If we're still in the block - * then we're done. - */ - if (--cur->bc_ptrs[level] > 0) { - *stat = 1; - return 0; - } - /* - * Get a pointer to the btree block. - */ - block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, - cur->bc_bufs[level]))) - return error; -#endif - /* - * If we just went off the left edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree decrementing pointers. - * Stop when we don't go off the left edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) - break; - /* - * Read-ahead the left block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ - xfs_buf_t *bp; /* buffer pointer for block */ - - agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, agbno, 0, &bp, - XFS_ALLOC_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs); + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } - *stat = 1; - return 0; } -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -int /* error */ -xfs_alloc_delete( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +static void +xfs_allocbt_write_verify( + struct xfs_buf *bp) { - int error; /* error return value */ - int i; /* result code */ - int level; /* btree level */ - - /* - * Go up the tree, starting at leaf level. - * If 2 is returned then a join was done; go to the next level. - * Otherwise we are done. - */ - for (level = 0, i = 2; i == 2; level++) { - if ((error = xfs_alloc_delrec(cur, level, &i))) - return error; + if (!xfs_allocbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } - if (i == 0) { - for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { - if ((error = xfs_alloc_decrement(cur, level, &i))) - return error; - break; - } - } - } - *stat = i; - return 0; + xfs_btree_sblock_calc_crc(bp); + } -/* - * Get the data from the pointed-to record. - */ -int /* error */ -xfs_alloc_get_rec( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t *bno, /* output: starting block of extent */ - xfs_extlen_t *len, /* output: length of extent */ - int *stat) /* output: success/failure */ -{ - xfs_alloc_block_t *block; /* btree block */ -#ifdef DEBUG - int error; /* error return value */ -#endif - int ptr; /* record number */ +const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .verify_read = xfs_allocbt_read_verify, + .verify_write = xfs_allocbt_write_verify, +}; - ptr = cur->bc_ptrs[0]; - block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) - return error; -#endif - /* - * Off the right end or left end, return failure. - */ - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - /* - * Point to the record and extract its data. - */ - { - xfs_alloc_rec_t *rec; /* record data */ - rec = XFS_ALLOC_REC_ADDR(block, ptr, cur); - *bno = be32_to_cpu(rec->ar_startblock); - *len = be32_to_cpu(rec->ar_blockcount); +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_allocbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); + } else { + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); } - *stat = 1; - return 0; } -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_alloc_increment( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +STATIC int +xfs_allocbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) { - xfs_alloc_block_t *block; /* btree block */ - xfs_buf_t *bp; /* tree block buffer */ - int error; /* error return value */ - int lev; /* btree level */ - - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the right at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); - /* - * Get a pointer to the btree block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Increment the ptr at this level. If we're still in the block - * then we're done. - */ - if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) { - *stat = 1; - return 0; - } - /* - * If we just went off the right edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree incrementing pointers. - * Stop when we don't go off the right edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - bp = cur->bc_bufs[lev]; - block = XFS_BUF_TO_ALLOC_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; -#endif - if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs)) - break; - /* - * Read-ahead the right block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp); - lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ - - agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.a.agno, agbno, 0, &bp, - XFS_ALLOC_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_ALLOC_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = 1; + if (cur->bc_btnum == XFS_BTNUM_BNO) { + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); + } else { + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); } - *stat = 1; - return 0; } +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_allocbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_rec_from_key = xfs_allocbt_init_rec_from_key, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_allocbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_allocbt_keys_inorder, + .recs_inorder = xfs_allocbt_recs_inorder, +#endif +}; /* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. + * Allocate a new allocation btree cursor. */ -int /* error */ -xfs_alloc_insert( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +struct xfs_btree_cur * /* new alloc btree cursor */ +xfs_allocbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agf structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* btree identifier */ { - int error; /* error return value */ - int i; /* result value, 0 for failure */ - int level; /* current level number in btree */ - xfs_agblock_t nbno; /* new block number (split result) */ - xfs_btree_cur_t *ncur; /* new cursor (split result) */ - xfs_alloc_rec_t nrec; /* record being inserted this level */ - xfs_btree_cur_t *pcur; /* previous level's cursor */ - - level = 0; - nbno = NULLAGBLOCK; - nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); - nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); - ncur = (xfs_btree_cur_t *)0; - pcur = cur; - /* - * Loop going up the tree, starting at the leaf level. - * Stop when we don't get a split block, that must mean that - * the insert is finished with this level. - */ - do { - /* - * Insert nrec/nbno into this level of the tree. - * Note if we fail, nbno will be null. - */ - if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur, - &i))) { - if (pcur != cur) - xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - return error; - } - /* - * See if the cursor we just used is trash. - * Can't trash the caller's cursor, but otherwise we should - * if ncur is a new cursor or we're about to be done. - */ - if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { - cur->bc_nlevels = pcur->bc_nlevels; - xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); - } - /* - * If we got a new cursor, switch to it. - */ - if (ncur) { - pcur = ncur; - ncur = (xfs_btree_cur_t *)0; - } - } while (nbno != NULLAGBLOCK); - *stat = i; - return 0; -} + struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp); + struct xfs_btree_cur *cur; -/* - * Lookup the record equal to [bno, len] in the btree given by cur. - */ -int /* error */ -xfs_alloc_lookup_eq( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* success/failure */ -{ - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat); -} + ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); -/* - * Lookup the first record greater than or equal to [bno, len] - * in the btree given by cur. - */ -int /* error */ -xfs_alloc_lookup_ge( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* success/failure */ -{ - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat); -} + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); -/* - * Lookup the first record less than or equal to [bno, len] - * in the btree given by cur. - */ -int /* error */ -xfs_alloc_lookup_le( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* success/failure */ -{ - cur->bc_rec.a.ar_startblock = bno; - cur->bc_rec.a.ar_blockcount = len; - return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat); + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_ops = &xfs_allocbt_ops; + + if (btnum == XFS_BTNUM_CNT) { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); + cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; + } else { + cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); + } + + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + return cur; } /* - * Update the record referred to by cur, to the value given by [bno, len]. - * This either works (return 0) or gets an EFSCORRUPTED error. + * Calculate number of records in an alloc btree block. */ -int /* error */ -xfs_alloc_update( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t bno, /* starting block of extent */ - xfs_extlen_t len) /* length of extent */ +int +xfs_allocbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) { - xfs_alloc_block_t *block; /* btree block to update */ - int error; /* error return value */ - int ptr; /* current record number (updating) */ + blocklen -= XFS_ALLOC_BLOCK_LEN(mp); - ASSERT(len > 0); - /* - * Pick up the a.g. freelist struct and the current block. - */ - block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0]))) - return error; -#endif - /* - * Get the address of the rec to be updated. - */ - ptr = cur->bc_ptrs[0]; - { - xfs_alloc_rec_t *rp; /* pointer to updated record */ - - rp = XFS_ALLOC_REC_ADDR(block, ptr, cur); - /* - * Fill in the new contents and log them. - */ - rp->ar_startblock = cpu_to_be32(bno); - rp->ar_blockcount = cpu_to_be32(len); - xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr); - } - /* - * If it's the by-size btree and it's the last leaf block and - * it's the last record... then update the size of the longest - * extent in the a.g., which we cache in the a.g. freelist header. - */ - if (cur->bc_btnum == XFS_BTNUM_CNT && - be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK && - ptr == be16_to_cpu(block->bb_numrecs)) { - xfs_agf_t *agf; /* a.g. freespace header */ - xfs_agnumber_t seqno; - - agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp); - seqno = be32_to_cpu(agf->agf_seqno); - cur->bc_mp->m_perag[seqno].pagf_longest = len; - agf->agf_longest = cpu_to_be32(len); - xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, - XFS_AGF_LONGEST); - } - /* - * Updating first record in leaf. Pass new key value up to our parent. - */ - if (ptr == 1) { - xfs_alloc_key_t key; /* key containing [bno, len] */ - - key.ar_startblock = cpu_to_be32(bno); - key.ar_blockcount = cpu_to_be32(len); - if ((error = xfs_alloc_updkey(cur, &key, 1))) - return error; - } - return 0; + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); } diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index bce81c7a4fd..45e189e7e81 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h @@ -24,136 +24,42 @@ struct xfs_buf; struct xfs_btree_cur; -struct xfs_btree_sblock; struct xfs_mount; /* - * There are two on-disk btrees, one sorted by blockno and one sorted - * by blockcount and blockno. All blocks look the same to make the code - * simpler; if we have time later, we'll make the optimizations. + * Btree block header size depends on a superblock flag. */ -#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ -#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ - -/* - * Data record/key structure - */ -typedef struct xfs_alloc_rec { - __be32 ar_startblock; /* starting block number */ - __be32 ar_blockcount; /* count of free blocks */ -} xfs_alloc_rec_t, xfs_alloc_key_t; - -typedef struct xfs_alloc_rec_incore { - xfs_agblock_t ar_startblock; /* starting block number */ - xfs_extlen_t ar_blockcount; /* count of free blocks */ -} xfs_alloc_rec_incore_t; - -/* btree pointer type */ -typedef __be32 xfs_alloc_ptr_t; -/* btree block header type */ -typedef struct xfs_btree_sblock xfs_alloc_block_t; - -#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)XFS_BUF_PTR(bp)) - -/* - * Real block structures have a size equal to the disk block size. - */ -#define XFS_ALLOC_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) -#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0]) -#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0]) - -/* - * Minimum and maximum blocksize and sectorsize. - * The blocksize upper limit is pretty much arbitrary. - * The sectorsize upper limit is due to sizeof(sb_sectsize). - */ -#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ -#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) -#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) -#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ -#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ -#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) -#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) - -/* - * Block numbers in the AG: - * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. - */ -#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) -#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) +#define XFS_ALLOC_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. - */ -#define XFS_ALLOC_REC_ADDR(bb,i,cur) \ - XFS_BTREE_REC_ADDR(XFS_ALLOC_BLOCK_SIZE(0,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(0, cur)) - -#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \ - XFS_BTREE_KEY_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) - -#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \ - XFS_BTREE_PTR_ADDR(XFS_ALLOC_BLOCK_SIZE(1,cur), xfs_alloc, \ - bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur)) - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat); - -/* - * Get the data from the pointed-to record. - */ -extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno, - xfs_extlen_t *len, int *stat); - -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. - */ -extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat); - -/* - * Lookup the record equal to [bno, len] in the btree given by cur. - */ -extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, int *stat); - -/* - * Lookup the first record greater than or equal to [bno, len] - * in the btree given by cur. - */ -extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, int *stat); - -/* - * Lookup the first record less than or equal to [bno, len] - * in the btree given by cur. - */ -extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len, int *stat); - -/* - * Update the record referred to by cur, to the value given by [bno, len]. - * This either works (return 0) or gets an EFSCORRUPTED error. - */ -extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno, - xfs_extlen_t len); + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_ALLOC_REC_ADDR(mp, block, index) \ + ((xfs_alloc_rec_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_alloc_rec_t)))) + +#define XFS_ALLOC_KEY_ADDR(mp, block, index) \ + ((xfs_alloc_key_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_alloc_key_t))) + +#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_alloc_ptr_t *) \ + ((char *)(block) + \ + XFS_ALLOC_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_alloc_key_t) + \ + ((index) - 1) * sizeof(xfs_alloc_ptr_t))) + +extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, + xfs_agnumber_t, xfs_btnum_t); +extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c new file mode 100644 index 00000000000..faaf716e208 --- /dev/null +++ b/fs/xfs/xfs_aops.c @@ -0,0 +1,1770 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_iomap.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include <linux/aio.h> +#include <linux/gfp.h> +#include <linux/mpage.h> +#include <linux/pagevec.h> +#include <linux/writeback.h> + +void +xfs_count_page_state( + struct page *page, + int *delalloc, + int *unwritten) +{ + struct buffer_head *bh, *head; + + *delalloc = *unwritten = 0; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) + (*unwritten) = 1; + else if (buffer_delay(bh)) + (*delalloc) = 1; + } while ((bh = bh->b_this_page) != head); +} + +STATIC struct block_device * +xfs_find_bdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_bdev; + else + return mp->m_ddev_targp->bt_bdev; +} + +/* + * We're now finished for good with this ioend structure. + * Update the page state via the associated buffer_heads, + * release holds on the inode and bio, and finally free + * up memory. Do not use the ioend after this. + */ +STATIC void +xfs_destroy_ioend( + xfs_ioend_t *ioend) +{ + struct buffer_head *bh, *next; + + for (bh = ioend->io_buffer_head; bh; bh = next) { + next = bh->b_private; + bh->b_end_io(bh, !ioend->io_error); + } + + mempool_free(ioend, xfs_ioend_pool); +} + +/* + * Fast and loose check if this write could update the on-disk inode size. + */ +static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend) +{ + return ioend->io_offset + ioend->io_size > + XFS_I(ioend->io_inode)->i_d.di_size; +} + +STATIC int +xfs_setfilesize_trans_alloc( + struct xfs_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + ioend->io_append_trans = tp; + + /* + * We may pass freeze protection with a transaction. So tell lockdep + * we released it. + */ + rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 1, _THIS_IP_); + /* + * We hand off the transaction to the completion thread now, so + * clear the flag here. + */ + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + return 0; +} + +/* + * Update on-disk file size now that data has been written to disk. + */ +STATIC int +xfs_setfilesize( + struct xfs_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_trans *tp = ioend->io_append_trans; + xfs_fsize_t isize; + + /* + * The transaction may have been allocated in the I/O submission thread, + * thus we need to mark ourselves as beeing in a transaction manually. + * Similarly for freeze protection. + */ + current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], + 0, 1, _THIS_IP_); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); + if (!isize) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp, 0); + return 0; + } + + trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size); + + ip->i_d.di_size = isize; + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + return xfs_trans_commit(tp, 0); +} + +/* + * Schedule IO completion handling on the final put of an ioend. + * + * If there is no work to do we might as well call it a day and free the + * ioend right now. + */ +STATIC void +xfs_finish_ioend( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) { + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + + if (ioend->io_type == XFS_IO_UNWRITTEN) + queue_work(mp->m_unwritten_workqueue, &ioend->io_work); + else if (ioend->io_append_trans || + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) + queue_work(mp->m_data_workqueue, &ioend->io_work); + else + xfs_destroy_ioend(ioend); + } +} + +/* + * IO write completion. + */ +STATIC void +xfs_end_io( + struct work_struct *work) +{ + xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work); + struct xfs_inode *ip = XFS_I(ioend->io_inode); + int error = 0; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + ioend->io_error = -EIO; + goto done; + } + if (ioend->io_error) + goto done; + + /* + * For unwritten extents we need to issue transactions to convert a + * range to normal written extens after the data I/O has finished. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) { + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, + ioend->io_size); + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { + /* + * For direct I/O we do not know if we need to allocate blocks + * or not so we can't preallocate an append transaction as that + * results in nested reservations and log space deadlocks. Hence + * allocate the transaction here. While this is sub-optimal and + * can block IO completion for some time, we're stuck with doing + * it this way until we can pass the ioend to the direct IO + * allocation callbacks and avoid nesting that way. + */ + error = xfs_setfilesize_trans_alloc(ioend); + if (error) + goto done; + error = xfs_setfilesize(ioend); + } else if (ioend->io_append_trans) { + error = xfs_setfilesize(ioend); + } else { + ASSERT(!xfs_ioend_is_append(ioend)); + } + +done: + if (error) + ioend->io_error = -error; + xfs_destroy_ioend(ioend); +} + +/* + * Call IO completion handling in caller context on the final put of an ioend. + */ +STATIC void +xfs_finish_ioend_sync( + struct xfs_ioend *ioend) +{ + if (atomic_dec_and_test(&ioend->io_remaining)) + xfs_end_io(&ioend->io_work); +} + +/* + * Allocate and initialise an IO completion structure. + * We need to track unwritten extent write completion here initially. + * We'll need to extend this for updating the ondisk inode size later + * (vs. incore size). + */ +STATIC xfs_ioend_t * +xfs_alloc_ioend( + struct inode *inode, + unsigned int type) +{ + xfs_ioend_t *ioend; + + ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS); + + /* + * Set the count to 1 initially, which will prevent an I/O + * completion callback from happening before we have started + * all the I/O from calling the completion routine too early. + */ + atomic_set(&ioend->io_remaining, 1); + ioend->io_isdirect = 0; + ioend->io_error = 0; + ioend->io_list = NULL; + ioend->io_type = type; + ioend->io_inode = inode; + ioend->io_buffer_head = NULL; + ioend->io_buffer_tail = NULL; + ioend->io_offset = 0; + ioend->io_size = 0; + ioend->io_append_trans = NULL; + + INIT_WORK(&ioend->io_work, xfs_end_io); + return ioend; +} + +STATIC int +xfs_map_blocks( + struct inode *inode, + loff_t offset, + struct xfs_bmbt_irec *imap, + int type, + int nonblocking) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t count = 1 << inode->i_blkbits; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int bmapi_flags = XFS_BMAPI_ENTIRE; + int nimaps = 1; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (type == XFS_IO_UNWRITTEN) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { + if (nonblocking) + return -XFS_ERROR(EAGAIN); + xfs_ilock(ip, XFS_ILOCK_SHARED); + } + + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); + ASSERT(offset <= mp->m_super->s_maxbytes); + + if (offset + count > mp->m_super->s_maxbytes) + count = mp->m_super->s_maxbytes - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + imap, &nimaps, bmapi_flags); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (error) + return -XFS_ERROR(error); + + if (type == XFS_IO_DELALLOC && + (!nimaps || isnullstartblock(imap->br_startblock))) { + error = xfs_iomap_write_allocate(ip, offset, imap); + if (!error) + trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); + return -XFS_ERROR(error); + } + +#ifdef DEBUG + if (type == XFS_IO_UNWRITTEN) { + ASSERT(nimaps); + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + } +#endif + if (nimaps) + trace_xfs_map_blocks_found(ip, offset, count, type, imap); + return 0; +} + +STATIC int +xfs_imap_valid( + struct inode *inode, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + offset >>= inode->i_blkbits; + + return offset >= imap->br_startoff && + offset < imap->br_startoff + imap->br_blockcount; +} + +/* + * BIO completion handler for buffered IO. + */ +STATIC void +xfs_end_bio( + struct bio *bio, + int error) +{ + xfs_ioend_t *ioend = bio->bi_private; + + ASSERT(atomic_read(&bio->bi_cnt) >= 1); + ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; + + /* Toss bio and pass work off to an xfsdatad thread */ + bio->bi_private = NULL; + bio->bi_end_io = NULL; + bio_put(bio); + + xfs_finish_ioend(ioend); +} + +STATIC void +xfs_submit_ioend_bio( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + struct bio *bio) +{ + atomic_inc(&ioend->io_remaining); + bio->bi_private = ioend; + bio->bi_end_io = xfs_end_bio; + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); +} + +STATIC struct bio * +xfs_alloc_ioend_bio( + struct buffer_head *bh) +{ + int nvecs = bio_get_nr_vecs(bh->b_bdev); + struct bio *bio = bio_alloc(GFP_NOIO, nvecs); + + ASSERT(bio->bi_private == NULL); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + return bio; +} + +STATIC void +xfs_start_buffer_writeback( + struct buffer_head *bh) +{ + ASSERT(buffer_mapped(bh)); + ASSERT(buffer_locked(bh)); + ASSERT(!buffer_delay(bh)); + ASSERT(!buffer_unwritten(bh)); + + mark_buffer_async_write(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); +} + +STATIC void +xfs_start_page_writeback( + struct page *page, + int clear_dirty, + int buffers) +{ + ASSERT(PageLocked(page)); + ASSERT(!PageWriteback(page)); + if (clear_dirty) + clear_page_dirty_for_io(page); + set_page_writeback(page); + unlock_page(page); + /* If no buffers on the page are to be written, finish it here */ + if (!buffers) + end_page_writeback(page); +} + +static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh) +{ + return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); +} + +/* + * Submit all of the bios for all of the ioends we have saved up, covering the + * initial writepage page and also any probed pages. + * + * Because we may have multiple ioends spanning a page, we need to start + * writeback on all the buffers before we submit them for I/O. If we mark the + * buffers as we got, then we can end up with a page that only has buffers + * marked async write and I/O complete on can occur before we mark the other + * buffers async write. + * + * The end result of this is that we trip a bug in end_page_writeback() because + * we call it twice for the one page as the code in end_buffer_async_write() + * assumes that all buffers on the page are started at the same time. + * + * The fix is two passes across the ioend list - one to start writeback on the + * buffer_heads, and then submit them for I/O on the second pass. + * + * If @fail is non-zero, it means that we have a situation where some part of + * the submission process has failed after we have marked paged for writeback + * and unlocked them. In this situation, we need to fail the ioend chain rather + * than submit it to IO. This typically only happens on a filesystem shutdown. + */ +STATIC void +xfs_submit_ioend( + struct writeback_control *wbc, + xfs_ioend_t *ioend, + int fail) +{ + xfs_ioend_t *head = ioend; + xfs_ioend_t *next; + struct buffer_head *bh; + struct bio *bio; + sector_t lastblock = 0; + + /* Pass 1 - start writeback */ + do { + next = ioend->io_list; + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) + xfs_start_buffer_writeback(bh); + } while ((ioend = next) != NULL); + + /* Pass 2 - submit I/O */ + ioend = head; + do { + next = ioend->io_list; + bio = NULL; + + /* + * If we are failing the IO now, just mark the ioend with an + * error and finish it. This will run IO completion immediately + * as there is only one reference to the ioend at this point in + * time. + */ + if (fail) { + ioend->io_error = -fail; + xfs_finish_ioend(ioend); + continue; + } + + for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { + + if (!bio) { + retry: + bio = xfs_alloc_ioend_bio(bh); + } else if (bh->b_blocknr != lastblock + 1) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + if (xfs_bio_add_buffer(bio, bh) != bh->b_size) { + xfs_submit_ioend_bio(wbc, ioend, bio); + goto retry; + } + + lastblock = bh->b_blocknr; + } + if (bio) + xfs_submit_ioend_bio(wbc, ioend, bio); + xfs_finish_ioend(ioend); + } while ((ioend = next) != NULL); +} + +/* + * Cancel submission of all buffer_heads so far in this endio. + * Toss the endio too. Only ever called for the initial page + * in a writepage request, so only ever one page. + */ +STATIC void +xfs_cancel_ioend( + xfs_ioend_t *ioend) +{ + xfs_ioend_t *next; + struct buffer_head *bh, *next_bh; + + do { + next = ioend->io_list; + bh = ioend->io_buffer_head; + do { + next_bh = bh->b_private; + clear_buffer_async_write(bh); + unlock_buffer(bh); + } while ((bh = next_bh) != NULL); + + mempool_free(ioend, xfs_ioend_pool); + } while ((ioend = next) != NULL); +} + +/* + * Test to see if we've been building up a completion structure for + * earlier buffers -- if so, we try to append to this ioend if we + * can, otherwise we finish off any current ioend and start another. + * Return true if we've finished the given ioend. + */ +STATIC void +xfs_add_to_ioend( + struct inode *inode, + struct buffer_head *bh, + xfs_off_t offset, + unsigned int type, + xfs_ioend_t **result, + int need_ioend) +{ + xfs_ioend_t *ioend = *result; + + if (!ioend || need_ioend || type != ioend->io_type) { + xfs_ioend_t *previous = *result; + + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_buffer_head = bh; + ioend->io_buffer_tail = bh; + if (previous) + previous->io_list = ioend; + *result = ioend; + } else { + ioend->io_buffer_tail->b_private = bh; + ioend->io_buffer_tail = bh; + } + + bh->b_private = NULL; + ioend->io_size += bh->b_size; +} + +STATIC void +xfs_map_buffer( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + sector_t bn; + struct xfs_mount *m = XFS_I(inode)->i_mount; + xfs_off_t iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff); + xfs_daddr_t iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock); + + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) + + ((offset - iomap_offset) >> inode->i_blkbits); + + ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode))); + + bh->b_blocknr = bn; + set_buffer_mapped(bh); +} + +STATIC void +xfs_map_at_offset( + struct inode *inode, + struct buffer_head *bh, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + ASSERT(imap->br_startblock != HOLESTARTBLOCK); + ASSERT(imap->br_startblock != DELAYSTARTBLOCK); + + xfs_map_buffer(inode, bh, imap, offset); + set_buffer_mapped(bh); + clear_buffer_delay(bh); + clear_buffer_unwritten(bh); +} + +/* + * Test if a given page contains at least one buffer of a given @type. + * If @check_all_buffers is true, then we walk all the buffers in the page to + * try to find one of the type passed in. If it is not set, then the caller only + * needs to check the first buffer on the page for a match. + */ +STATIC bool +xfs_check_page_type( + struct page *page, + unsigned int type, + bool check_all_buffers) +{ + struct buffer_head *bh; + struct buffer_head *head; + + if (PageWriteback(page)) + return false; + if (!page->mapping) + return false; + if (!page_has_buffers(page)) + return false; + + bh = head = page_buffers(page); + do { + if (buffer_unwritten(bh)) { + if (type == XFS_IO_UNWRITTEN) + return true; + } else if (buffer_delay(bh)) { + if (type == XFS_IO_DELALLOC) + return true; + } else if (buffer_dirty(bh) && buffer_mapped(bh)) { + if (type == XFS_IO_OVERWRITE) + return true; + } + + /* If we are only checking the first buffer, we are done now. */ + if (!check_all_buffers) + break; + } while ((bh = bh->b_this_page) != head); + + return false; +} + +/* + * Allocate & map buffers for page given the extent map. Write it out. + * except for the original page of a writepage, this is called on + * delalloc/unwritten pages only, for the original page it is possible + * that the page has no mapping at all. + */ +STATIC int +xfs_convert_page( + struct inode *inode, + struct page *page, + loff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + xfs_off_t end_offset; + unsigned long p_offset; + unsigned int type; + int len, page_dirty; + int count = 0, done = 0, uptodate = 1; + xfs_off_t offset = page_offset(page); + + if (page->index != tindex) + goto fail; + if (!trylock_page(page)) + goto fail; + if (PageWriteback(page)) + goto fail_unlock_page; + if (page->mapping != inode->i_mapping) + goto fail_unlock_page; + if (!xfs_check_page_type(page, (*ioendp)->io_type, false)) + goto fail_unlock_page; + + /* + * page_dirty is initially a count of buffers on the page before + * EOF and is decremented as we move each into a cleanable state. + * + * Derivation: + * + * End offset is the highest offset that this page should represent. + * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) + * will evaluate non-zero and be less than PAGE_CACHE_SIZE and + * hence give us the correct page_dirty count. On any other page, + * it will be zero and in that case we need page_dirty to be the + * count of buffers on the page. + */ + end_offset = min_t(unsigned long long, + (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, + i_size_read(inode)); + + /* + * If the current map does not span the entire page we are about to try + * to write, then give up. The only way we can write a page that spans + * multiple mappings in a single writeback iteration is via the + * xfs_vm_writepage() function. Data integrity writeback requires the + * entire page to be written in a single attempt, otherwise the part of + * the page we don't write here doesn't get written as part of the data + * integrity sync. + * + * For normal writeback, we also don't attempt to write partial pages + * here as it simply means that write_cache_pages() will see it under + * writeback and ignore the page until some point in the future, at + * which time this will be the only page in the file that needs + * writeback. Hence for more optimal IO patterns, we should always + * avoid partial page writeback due to multiple mappings on a page here. + */ + if (!xfs_imap_valid(inode, imap, end_offset)) + goto fail_unlock_page; + + len = 1 << inode->i_blkbits; + p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), + PAGE_CACHE_SIZE); + p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; + page_dirty = p_offset / len; + + /* + * The moment we find a buffer that doesn't match our current type + * specification or can't be written, abort the loop and start + * writeback. As per the above xfs_imap_valid() check, only + * xfs_vm_writepage() can handle partial page writeback fully - we are + * limited here to the buffers that are contiguous with the current + * ioend, and hence a buffer we can't write breaks that contiguity and + * we have to defer the rest of the IO to xfs_vm_writepage(). + */ + bh = head = page_buffers(page); + do { + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + if (!(PageUptodate(page) || buffer_uptodate(bh))) { + done = 1; + break; + } + + if (buffer_unwritten(bh) || buffer_delay(bh) || + buffer_mapped(bh)) { + if (buffer_unwritten(bh)) + type = XFS_IO_UNWRITTEN; + else if (buffer_delay(bh)) + type = XFS_IO_DELALLOC; + else + type = XFS_IO_OVERWRITE; + + /* + * imap should always be valid because of the above + * partial page end_offset check on the imap. + */ + ASSERT(xfs_imap_valid(inode, imap, offset)); + + lock_buffer(bh); + if (type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, + ioendp, done); + + page_dirty--; + count++; + } else { + done = 1; + break; + } + } while (offset += len, (bh = bh->b_this_page) != head); + + if (uptodate && bh == head) + SetPageUptodate(page); + + if (count) { + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) + done = 1; + } + xfs_start_page_writeback(page, !page_dirty, count); + + return done; + fail_unlock_page: + unlock_page(page); + fail: + return 1; +} + +/* + * Convert & write out a cluster of pages in the same extent as defined + * by mp and following the start page. + */ +STATIC void +xfs_cluster_write( + struct inode *inode, + pgoff_t tindex, + struct xfs_bmbt_irec *imap, + xfs_ioend_t **ioendp, + struct writeback_control *wbc, + pgoff_t tlast) +{ + struct pagevec pvec; + int done = 0, i; + + pagevec_init(&pvec, 0); + while (!done && tindex <= tlast) { + unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); + + if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + done = xfs_convert_page(inode, pvec.pages[i], tindex++, + imap, ioendp, wbc); + if (done) + break; + } + + pagevec_release(&pvec); + cond_resched(); + } +} + +STATIC void +xfs_vm_invalidatepage( + struct page *page, + unsigned int offset, + unsigned int length) +{ + trace_xfs_invalidatepage(page->mapping->host, page, offset, + length); + block_invalidatepage(page, offset, length); +} + +/* + * If the page has delalloc buffers on it, we need to punch them out before we + * invalidate the page. If we don't, we leave a stale delalloc mapping on the + * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read + * is done on that same region - the delalloc extent is returned when none is + * supposed to be there. + * + * We prevent this by truncating away the delalloc regions on the page before + * invalidating it. Because they are delalloc, we can do this without needing a + * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this + * truncation without a transaction as there is no space left for block + * reservation (typically why we see a ENOSPC in writeback). + * + * This is not a performance critical path, so for now just do the punching a + * buffer head at a time. + */ +STATIC void +xfs_aops_discard_page( + struct page *page) +{ + struct inode *inode = page->mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct buffer_head *bh, *head; + loff_t offset = page_offset(page); + + if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true)) + goto out_invalidate; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_invalidate; + + xfs_alert(ip->i_mount, + "page discard on page %p, inode 0x%llx, offset %llu.", + page, ip->i_ino, offset); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + bh = head = page_buffers(page); + do { + int error; + xfs_fileoff_t start_fsb; + + if (!buffer_delay(bh)) + goto next_buffer; + + start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "page discard unable to remove delalloc mapping."); + } + break; + } +next_buffer: + offset += 1 << inode->i_blkbits; + + } while ((bh = bh->b_this_page) != head); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_invalidate: + xfs_vm_invalidatepage(page, 0, PAGE_CACHE_SIZE); + return; +} + +/* + * Write out a dirty page. + * + * For delalloc space on the page we need to allocate space and flush it. + * For unwritten space on the page we need to start the conversion to + * regular allocated space. + * For any other dirty buffer heads on the page we should flush them. + */ +STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct buffer_head *bh, *head; + struct xfs_bmbt_irec imap; + xfs_ioend_t *ioend = NULL, *iohead = NULL; + loff_t offset; + unsigned int type; + __uint64_t end_offset; + pgoff_t end_index, last_index; + ssize_t len; + int err, imap_valid = 0, uptodate = 1; + int count = 0; + int nonblocking = 0; + + trace_xfs_writepage(inode, page, 0, 0); + + ASSERT(page_has_buffers(page)); + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should never happen except in the case of a VM regression so + * warn about it. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == + PF_MEMALLOC)) + goto redirty; + + /* + * Given that we do not allow direct reclaim to call us, we should + * never be called while in a filesystem transaction. + */ + if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + goto redirty; + + /* Is this page beyond the end of the file? */ + offset = i_size_read(inode); + end_index = offset >> PAGE_CACHE_SHIFT; + last_index = (offset - 1) >> PAGE_CACHE_SHIFT; + + /* + * The page index is less than the end_index, adjust the end_offset + * to the highest offset that this page should represent. + * ----------------------------------------------------- + * | file mapping | <EOF> | + * ----------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | | + * ^--------------------------------^----------|-------- + * | desired writeback range | see else | + * ---------------------------------^------------------| + */ + if (page->index < end_index) + end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT; + else { + /* + * Check whether the page to write out is beyond or straddles + * i_size or not. + * ------------------------------------------------------- + * | file mapping | <EOF> | + * ------------------------------------------------------- + * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | + * ^--------------------------------^-----------|--------- + * | | Straddles | + * ---------------------------------^-----------|--------| + */ + unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1); + + /* + * Skip the page if it is fully outside i_size, e.g. due to a + * truncate operation that is in progress. We must redirty the + * page so that reclaim stops reclaiming it. Otherwise + * xfs_vm_releasepage() is called on it and gets confused. + * + * Note that the end_index is unsigned long, it would overflow + * if the given offset is greater than 16TB on 32-bit system + * and if we do check the page is fully outside i_size or not + * via "if (page->index >= end_index + 1)" as "end_index + 1" + * will be evaluated to 0. Hence this page will be redirtied + * and be written out repeatedly which would result in an + * infinite loop, the user program that perform this operation + * will hang. Instead, we can verify this situation by checking + * if the page to write is totally beyond the i_size or if it's + * offset is just equal to the EOF. + */ + if (page->index > end_index || + (page->index == end_index && offset_into_page == 0)) + goto redirty; + + /* + * The page straddles i_size. It must be zeroed out on each + * and every writepage invocation because it may be mmapped. + * "A file is mapped in multiples of the page size. For a file + * that is not a multiple of the page size, the remaining + * memory is zeroed when mapped, and writes to that region are + * not written out to the file." + */ + zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE); + + /* Adjust the end_offset to the end of file */ + end_offset = offset; + } + + len = 1 << inode->i_blkbits; + + bh = head = page_buffers(page); + offset = page_offset(page); + type = XFS_IO_OVERWRITE; + + if (wbc->sync_mode == WB_SYNC_NONE) + nonblocking = 1; + + do { + int new_ioend = 0; + + if (offset >= end_offset) + break; + if (!buffer_uptodate(bh)) + uptodate = 0; + + /* + * set_page_dirty dirties all buffers in a page, independent + * of their state. The dirty state however is entirely + * meaningless for holes (!mapped && uptodate), so skip + * buffers covering holes here. + */ + if (!buffer_mapped(bh) && buffer_uptodate(bh)) { + imap_valid = 0; + continue; + } + + if (buffer_unwritten(bh)) { + if (type != XFS_IO_UNWRITTEN) { + type = XFS_IO_UNWRITTEN; + imap_valid = 0; + } + } else if (buffer_delay(bh)) { + if (type != XFS_IO_DELALLOC) { + type = XFS_IO_DELALLOC; + imap_valid = 0; + } + } else if (buffer_uptodate(bh)) { + if (type != XFS_IO_OVERWRITE) { + type = XFS_IO_OVERWRITE; + imap_valid = 0; + } + } else { + if (PageUptodate(page)) + ASSERT(buffer_mapped(bh)); + /* + * This buffer is not uptodate and will not be + * written to disk. Ensure that we will put any + * subsequent writeable buffers into a new + * ioend. + */ + imap_valid = 0; + continue; + } + + if (imap_valid) + imap_valid = xfs_imap_valid(inode, &imap, offset); + if (!imap_valid) { + /* + * If we didn't have a valid mapping then we need to + * put the new mapping into a separate ioend structure. + * This ensures non-contiguous extents always have + * separate ioends, which is particularly important + * for unwritten extent conversion at I/O completion + * time. + */ + new_ioend = 1; + err = xfs_map_blocks(inode, offset, &imap, type, + nonblocking); + if (err) + goto error; + imap_valid = xfs_imap_valid(inode, &imap, offset); + } + if (imap_valid) { + lock_buffer(bh); + if (type != XFS_IO_OVERWRITE) + xfs_map_at_offset(inode, bh, &imap, offset); + xfs_add_to_ioend(inode, bh, offset, type, &ioend, + new_ioend); + count++; + } + + if (!iohead) + iohead = ioend; + + } while (offset += len, ((bh = bh->b_this_page) != head)); + + if (uptodate && bh == head) + SetPageUptodate(page); + + xfs_start_page_writeback(page, 1, count); + + /* if there is no IO to be submitted for this page, we are done */ + if (!ioend) + return 0; + + ASSERT(iohead); + + /* + * Any errors from this point onwards need tobe reported through the IO + * completion path as we have marked the initial page as under writeback + * and unlocked it. + */ + if (imap_valid) { + xfs_off_t end_index; + + end_index = imap.br_startoff + imap.br_blockcount; + + /* to bytes */ + end_index <<= inode->i_blkbits; + + /* to pages */ + end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; + + /* check against file size */ + if (end_index > last_index) + end_index = last_index; + + xfs_cluster_write(inode, page->index + 1, &imap, &ioend, + wbc, end_index); + } + + + /* + * Reserve log space if we might write beyond the on-disk inode size. + */ + err = 0; + if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) + err = xfs_setfilesize_trans_alloc(ioend); + + xfs_submit_ioend(wbc, iohead, err); + + return 0; + +error: + if (iohead) + xfs_cancel_ioend(iohead); + + if (err == -EAGAIN) + goto redirty; + + xfs_aops_discard_page(page); + ClearPageUptodate(page); + unlock_page(page); + return err; + +redirty: + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; +} + +STATIC int +xfs_vm_writepages( + struct address_space *mapping, + struct writeback_control *wbc) +{ + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return generic_writepages(mapping, wbc); +} + +/* + * Called to move a page into cleanable state - and from there + * to be released. The page should already be clean. We always + * have buffer heads in this call. + * + * Returns 1 if the page is ok to release, 0 otherwise. + */ +STATIC int +xfs_vm_releasepage( + struct page *page, + gfp_t gfp_mask) +{ + int delalloc, unwritten; + + trace_xfs_releasepage(page->mapping->host, page, 0, 0); + + xfs_count_page_state(page, &delalloc, &unwritten); + + if (WARN_ON_ONCE(delalloc)) + return 0; + if (WARN_ON_ONCE(unwritten)) + return 0; + + return try_to_free_buffers(page); +} + +STATIC int +__xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create, + int direct) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb, end_fsb; + int error = 0; + int lockmode = 0; + struct xfs_bmbt_irec imap; + int nimaps = 1; + xfs_off_t offset; + ssize_t size; + int new = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + offset = (xfs_off_t)iblock << inode->i_blkbits; + ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + + /* + * Direct I/O is usually done on preallocated files, so try getting + * a block mapping without an exclusive lock first. For buffered + * writes we already have the exclusive iolock anyway, so avoiding + * a lock roundtrip here by taking the ilock exclusive from the + * beginning is a useful micro optimization. + */ + if (create && !direct) { + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); + } else { + lockmode = xfs_ilock_data_map_shared(ip); + } + + ASSERT(offset <= mp->m_super->s_maxbytes); + if (offset + size > mp->m_super->s_maxbytes) + size = mp->m_super->s_maxbytes - offset; + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + if (create && + (!nimaps || + (imap.br_startblock == HOLESTARTBLOCK || + imap.br_startblock == DELAYSTARTBLOCK))) { + if (direct || xfs_get_extsz_hint(ip)) { + /* + * Drop the ilock in preparation for starting the block + * allocation transaction. It will be retaken + * exclusively inside xfs_iomap_write_direct for the + * actual allocation. + */ + xfs_iunlock(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, size, + &imap, nimaps); + if (error) + return -error; + new = 1; + } else { + /* + * Delalloc reservations do not require a transaction, + * we can go on without dropping the lock here. If we + * are allocating a new delalloc block, make sure that + * we set the new flag so that we mark the buffer new so + * that we know that it is newly allocated if the write + * fails. + */ + if (nimaps && imap.br_startblock == HOLESTARTBLOCK) + new = 1; + error = xfs_iomap_write_delay(ip, offset, size, &imap); + if (error) + goto out_unlock; + + xfs_iunlock(ip, lockmode); + } + + trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + } else if (nimaps) { + trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + xfs_iunlock(ip, lockmode); + } else { + trace_xfs_get_blocks_notfound(ip, offset, size); + goto out_unlock; + } + + if (imap.br_startblock != HOLESTARTBLOCK && + imap.br_startblock != DELAYSTARTBLOCK) { + /* + * For unwritten extents do not report a disk address on + * the read case (treat as if we're reading into a hole). + */ + if (create || !ISUNWRITTEN(&imap)) + xfs_map_buffer(inode, bh_result, &imap, offset); + if (create && ISUNWRITTEN(&imap)) { + if (direct) { + bh_result->b_private = inode; + set_buffer_defer_completion(bh_result); + } + set_buffer_unwritten(bh_result); + } + } + + /* + * If this is a realtime file, data may be on a different device. + * to that pointed to from the buffer_head b_bdev currently. + */ + bh_result->b_bdev = xfs_find_bdev_for_inode(inode); + + /* + * If we previously allocated a block out beyond eof and we are now + * coming back to use it then we will need to flag it as new even if it + * has a disk address. + * + * With sub-block writes into unwritten extents we also need to mark + * the buffer as new so that the unwritten parts of the buffer gets + * correctly zeroed. + */ + if (create && + ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || + (offset >= i_size_read(inode)) || + (new || ISUNWRITTEN(&imap)))) + set_buffer_new(bh_result); + + if (imap.br_startblock == DELAYSTARTBLOCK) { + BUG_ON(direct); + if (create) { + set_buffer_uptodate(bh_result); + set_buffer_mapped(bh_result); + set_buffer_delay(bh_result); + } + } + + /* + * If this is O_DIRECT or the mpage code calling tell them how large + * the mapping is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the + * mapping for blocks beyond EOF must be marked new so that sub block + * regions can be correctly zeroed. We can't do this for mappings within + * EOF unless the mapping was just allocated or is unwritten, otherwise + * the callers would overwrite existing data with zeros. Hence we have + * to split the mapping into a range up to and including EOF, and a + * second mapping for beyond EOF. + */ + if (direct || size > (1 << inode->i_blkbits)) { + xfs_off_t mapping_size; + + mapping_size = imap.br_startoff + imap.br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; + } + + return 0; + +out_unlock: + xfs_iunlock(ip, lockmode); + return -error; +} + +int +xfs_get_blocks( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 0); +} + +STATIC int +xfs_get_blocks_direct( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, 1); +} + +/* + * Complete a direct I/O write request. + * + * If the private argument is non-NULL __xfs_get_blocks signals us that we + * need to issue a transaction to convert the range from unwritten to written + * extents. In case this is regular synchronous I/O we just call xfs_end_io + * to do this and we are done. But in case this was a successful AIO + * request this handler is called from interrupt context, from which we + * can't start transactions. In that case offload the I/O completion to + * the workqueues we also use for buffered I/O completion. + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct xfs_ioend *ioend = iocb->private; + + /* + * While the generic direct I/O code updates the inode size, it does + * so only after the end_io handler is called, which means our + * end_io handler thinks the on-disk size is outside the in-core + * size. To prevent this just update it a little bit earlier here. + */ + if (offset + size > i_size_read(ioend->io_inode)) + i_size_write(ioend->io_inode, offset + size); + + /* + * blockdev_direct_IO can return an error even after the I/O + * completion handler was called. Thus we need to protect + * against double-freeing. + */ + iocb->private = NULL; + + ioend->io_offset = offset; + ioend->io_size = size; + if (private && size > 0) + ioend->io_type = XFS_IO_UNWRITTEN; + + xfs_finish_ioend_sync(ioend); +} + +STATIC ssize_t +xfs_vm_direct_IO( + int rw, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct block_device *bdev = xfs_find_bdev_for_inode(inode); + struct xfs_ioend *ioend = NULL; + ssize_t ret; + + if (rw & WRITE) { + size_t size = iov_iter_count(iter); + + /* + * We cannot preallocate a size update transaction here as we + * don't know whether allocation is necessary or not. Hence we + * can only tell IO completion that one is necessary if we are + * not doing unwritten extent conversion. + */ + iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); + if (offset + size > XFS_I(inode)->i_d.di_size) + ioend->io_isdirect = 1; + + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, + xfs_end_io_direct_write, NULL, + DIO_ASYNC_EXTEND); + if (ret != -EIOCBQUEUED && iocb->private) + goto out_destroy_ioend; + } else { + ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter, + offset, xfs_get_blocks_direct, + NULL, NULL, 0); + } + + return ret; + +out_destroy_ioend: + xfs_destroy_ioend(ioend); + return ret; +} + +/* + * Punch out the delalloc blocks we have already allocated. + * + * Don't bother with xfs_setattr given that nothing can have made it to disk yet + * as the page is still locked at this point. + */ +STATIC void +xfs_vm_kill_delalloc_range( + struct inode *inode, + loff_t start, + loff_t end) +{ + struct xfs_inode *ip = XFS_I(inode); + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error; + + start_fsb = XFS_B_TO_FSB(ip->i_mount, start); + end_fsb = XFS_B_TO_FSB(ip->i_mount, end); + if (end_fsb <= start_fsb) + return; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "xfs_vm_write_failed: unable to clean up ino %lld", + ip->i_ino); + } + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); +} + +STATIC void +xfs_vm_write_failed( + struct inode *inode, + struct page *page, + loff_t pos, + unsigned len) +{ + loff_t block_offset; + loff_t block_start; + loff_t block_end; + loff_t from = pos & (PAGE_CACHE_SIZE - 1); + loff_t to = from + len; + struct buffer_head *bh, *head; + + /* + * The request pos offset might be 32 or 64 bit, this is all fine + * on 64-bit platform. However, for 64-bit pos request on 32-bit + * platform, the high 32-bit will be masked off if we evaluate the + * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is + * 0xfffff000 as an unsigned long, hence the result is incorrect + * which could cause the following ASSERT failed in most cases. + * In order to avoid this, we can evaluate the block_offset of the + * start of the page by using shifts rather than masks the mismatch + * problem. + */ + block_offset = (pos >> PAGE_CACHE_SHIFT) << PAGE_CACHE_SHIFT; + + ASSERT(block_offset + from == pos); + + head = page_buffers(page); + block_start = 0; + for (bh = head; bh != head || !block_start; + bh = bh->b_this_page, block_start = block_end, + block_offset += bh->b_size) { + block_end = block_start + bh->b_size; + + /* skip buffers before the write */ + if (block_end <= from) + continue; + + /* if the buffer is after the write, we're done */ + if (block_start >= to) + break; + + if (!buffer_delay(bh)) + continue; + + if (!buffer_new(bh) && block_offset < i_size_read(inode)) + continue; + + xfs_vm_kill_delalloc_range(inode, block_offset, + block_offset + bh->b_size); + + /* + * This buffer does not contain data anymore. make sure anyone + * who finds it knows that for certain. + */ + clear_buffer_delay(bh); + clear_buffer_uptodate(bh); + clear_buffer_mapped(bh); + clear_buffer_new(bh); + clear_buffer_dirty(bh); + } + +} + +/* + * This used to call block_write_begin(), but it unlocks and releases the page + * on error, and we need that page to be able to punch stale delalloc blocks out + * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at + * the appropriate point. + */ +STATIC int +xfs_vm_write_begin( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; + int status; + + ASSERT(len <= PAGE_CACHE_SIZE); + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + status = __block_write_begin(page, pos, len, xfs_get_blocks); + if (unlikely(status)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + + xfs_vm_write_failed(inode, page, pos, len); + unlock_page(page); + + /* + * If the write is beyond EOF, we only want to kill blocks + * allocated in this write, not blocks that were previously + * written successfully. + */ + if (pos + len > isize) { + ssize_t start = max_t(ssize_t, pos, isize); + + truncate_pagecache_range(inode, start, pos + len); + } + + page_cache_release(page); + page = NULL; + } + + *pagep = page; + return status; +} + +/* + * On failure, we only need to kill delalloc blocks beyond EOF in the range of + * this specific write because they will never be written. Previous writes + * beyond EOF where block allocation succeeded do not need to be trashed, so + * only new blocks from this write should be trashed. For blocks within + * EOF, generic_write_end() zeros them so they are safe to leave alone and be + * written with all the other valid data. + */ +STATIC int +xfs_vm_write_end( + struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned copied, + struct page *page, + void *fsdata) +{ + int ret; + + ASSERT(len <= PAGE_CACHE_SIZE); + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (unlikely(ret < len)) { + struct inode *inode = mapping->host; + size_t isize = i_size_read(inode); + loff_t to = pos + len; + + if (to > isize) { + /* only kill blocks in this write beyond EOF */ + if (pos > isize) + isize = pos; + xfs_vm_kill_delalloc_range(inode, isize, to); + truncate_pagecache_range(inode, isize, to); + } + } + return ret; +} + +STATIC sector_t +xfs_vm_bmap( + struct address_space *mapping, + sector_t block) +{ + struct inode *inode = (struct inode *)mapping->host; + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_vm_bmap(XFS_I(inode)); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + filemap_write_and_wait(mapping); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return generic_block_bmap(mapping, block, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpage( + struct file *unused, + struct page *page) +{ + return mpage_readpage(page, xfs_get_blocks); +} + +STATIC int +xfs_vm_readpages( + struct file *unused, + struct address_space *mapping, + struct list_head *pages, + unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); +} + +const struct address_space_operations xfs_address_space_operations = { + .readpage = xfs_vm_readpage, + .readpages = xfs_vm_readpages, + .writepage = xfs_vm_writepage, + .writepages = xfs_vm_writepages, + .releasepage = xfs_vm_releasepage, + .invalidatepage = xfs_vm_invalidatepage, + .write_begin = xfs_vm_write_begin, + .write_end = xfs_vm_write_end, + .bmap = xfs_vm_bmap, + .direct_IO = xfs_vm_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/xfs_aops.h index 2244e516b66..f94dd459dff 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -18,10 +18,23 @@ #ifndef __XFS_AOPS_H__ #define __XFS_AOPS_H__ -extern struct workqueue_struct *xfsdatad_workqueue; extern mempool_t *xfs_ioend_pool; -typedef void (*xfs_ioend_func_t)(void *); +/* + * Types of I/O for bmap clustering and I/O completion tracking. + */ +enum { + XFS_IO_DIRECT = 0, /* special case for direct I/O ioends */ + XFS_IO_DELALLOC, /* covers delalloc region */ + XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ + XFS_IO_OVERWRITE, /* covers already allocated extent */ +}; + +#define XFS_IO_TYPES \ + { 0, "" }, \ + { XFS_IO_DELALLOC, "delalloc" }, \ + { XFS_IO_UNWRITTEN, "unwritten" }, \ + { XFS_IO_OVERWRITE, "overwrite" } /* * xfs_ioend struct manages large extent writes for XFS. @@ -32,15 +45,19 @@ typedef struct xfs_ioend { unsigned int io_type; /* delalloc / unwritten */ int io_error; /* I/O error code */ atomic_t io_remaining; /* hold count */ - struct bhv_vnode *io_vnode; /* file being written to */ + unsigned int io_isdirect : 1;/* direct I/O */ + struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct xfs_trans *io_append_trans;/* xact. for size update */ } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); +extern void xfs_count_page_state(struct page *, int *, int *); + #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h deleted file mode 100644 index c4836890b72..00000000000 --- a/fs/xfs/xfs_arch.h +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_ARCH_H__ -#define __XFS_ARCH_H__ - -#ifndef XFS_BIG_INUMS -# error XFS_BIG_INUMS must be defined true or false -#endif - -#ifdef __KERNEL__ - -#include <asm/byteorder.h> - -#ifdef __BIG_ENDIAN -#define XFS_NATIVE_HOST 1 -#else -#undef XFS_NATIVE_HOST -#endif - -#else /* __KERNEL__ */ - -#if __BYTE_ORDER == __BIG_ENDIAN -#define XFS_NATIVE_HOST 1 -#else -#undef XFS_NATIVE_HOST -#endif - -#ifdef XFS_NATIVE_HOST -#define cpu_to_be16(val) ((__be16)(val)) -#define cpu_to_be32(val) ((__be32)(val)) -#define cpu_to_be64(val) ((__be64)(val)) -#define be16_to_cpu(val) ((__uint16_t)(val)) -#define be32_to_cpu(val) ((__uint32_t)(val)) -#define be64_to_cpu(val) ((__uint64_t)(val)) -#else -#define cpu_to_be16(val) (__swab16((__uint16_t)(val))) -#define cpu_to_be32(val) (__swab32((__uint32_t)(val))) -#define cpu_to_be64(val) (__swab64((__uint64_t)(val))) -#define be16_to_cpu(val) (__swab16((__be16)(val))) -#define be32_to_cpu(val) (__swab32((__be32)(val))) -#define be64_to_cpu(val) (__swab64((__be64)(val))) -#endif - -#endif /* __KERNEL__ */ - -/* do we need conversion? */ -#define ARCH_NOCONVERT 1 -#ifdef XFS_NATIVE_HOST -# define ARCH_CONVERT ARCH_NOCONVERT -#else -# define ARCH_CONVERT 0 -#endif - -/* generic swapping macros */ - -#ifndef HAVE_SWABMACROS -#define INT_SWAP16(type,var) ((typeof(type))(__swab16((__u16)(var)))) -#define INT_SWAP32(type,var) ((typeof(type))(__swab32((__u32)(var)))) -#define INT_SWAP64(type,var) ((typeof(type))(__swab64((__u64)(var)))) -#endif - -#define INT_SWAP(type, var) \ - ((sizeof(type) == 8) ? INT_SWAP64(type,var) : \ - ((sizeof(type) == 4) ? INT_SWAP32(type,var) : \ - ((sizeof(type) == 2) ? INT_SWAP16(type,var) : \ - (var)))) - -/* - * get and set integers from potentially unaligned locations - */ - -#define INT_GET_UNALIGNED_16_BE(pointer) \ - ((__u16)((((__u8*)(pointer))[0] << 8) | (((__u8*)(pointer))[1]))) -#define INT_SET_UNALIGNED_16_BE(pointer,value) \ - { \ - ((__u8*)(pointer))[0] = (((value) >> 8) & 0xff); \ - ((__u8*)(pointer))[1] = (((value) ) & 0xff); \ - } - -/* define generic INT_ macros */ - -#define INT_GET(reference,arch) \ - (((arch) == ARCH_NOCONVERT) \ - ? \ - (reference) \ - : \ - INT_SWAP((reference),(reference)) \ - ) - -/* does not return a value */ -#define INT_SET(reference,arch,valueref) \ - (__builtin_constant_p(valueref) ? \ - (void)( (reference) = ( ((arch) != ARCH_NOCONVERT) ? (INT_SWAP((reference),(valueref))) : (valueref)) ) : \ - (void)( \ - ((reference) = (valueref)), \ - ( ((arch) != ARCH_NOCONVERT) ? (reference) = INT_SWAP((reference),(reference)) : 0 ) \ - ) \ - ) - -/* does not return a value */ -#define INT_MOD_EXPR(reference,arch,code) \ - (((arch) == ARCH_NOCONVERT) \ - ? \ - (void)((reference) code) \ - : \ - (void)( \ - (reference) = INT_GET((reference),arch) , \ - ((reference) code), \ - INT_SET(reference, arch, reference) \ - ) \ - ) - -/* does not return a value */ -#define INT_MOD(reference,arch,delta) \ - (void)( \ - INT_MOD_EXPR(reference,arch,+=(delta)) \ - ) - -/* - * INT_COPY - copy a value between two locations with the - * _same architecture_ but _potentially different sizes_ - * - * if the types of the two parameters are equal or they are - * in native architecture, a simple copy is done - * - * otherwise, architecture conversions are done - * - */ - -/* does not return a value */ -#define INT_COPY(dst,src,arch) \ - ( \ - ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \ - ? \ - (void)((dst) = (src)) \ - : \ - INT_SET(dst, arch, INT_GET(src, arch)) \ - ) - -/* - * INT_XLATE - copy a value in either direction between two locations - * with different architectures - * - * dir < 0 - copy from memory to buffer (native to arch) - * dir > 0 - copy from buffer to memory (arch to native) - */ - -/* does not return a value */ -#define INT_XLATE(buf,mem,dir,arch) {\ - ASSERT(dir); \ - if (dir>0) { \ - (mem)=INT_GET(buf, arch); \ - } else { \ - INT_SET(buf, arch, mem); \ - } \ -} - -static inline void be16_add(__be16 *a, __s16 b) -{ - *a = cpu_to_be16(be16_to_cpu(*a) + b); -} - -static inline void be32_add(__be32 *a, __s32 b) -{ - *a = cpu_to_be32(be32_to_cpu(*a) + b); -} - -static inline void be64_add(__be64 *a, __s64 b) -{ - *a = cpu_to_be64(be64_to_cpu(*a) + b); -} - -/* - * In directories inode numbers are stored as unaligned arrays of unsigned - * 8bit integers on disk. - * - * For v1 directories or v2 directories that contain inode numbers that - * do not fit into 32bit the array has eight members, but the first member - * is always zero: - * - * |unused|48-55|40-47|32-39|24-31|16-23| 8-15| 0- 7| - * - * For v2 directories that only contain entries with inode numbers that fit - * into 32bits a four-member array is used: - * - * |24-31|16-23| 8-15| 0- 7| - */ - -#define XFS_GET_DIR_INO4(di) \ - (((__u32)(di).i[0] << 24) | ((di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) - -#define XFS_PUT_DIR_INO4(from, di) \ -do { \ - (di).i[0] = (((from) & 0xff000000ULL) >> 24); \ - (di).i[1] = (((from) & 0x00ff0000ULL) >> 16); \ - (di).i[2] = (((from) & 0x0000ff00ULL) >> 8); \ - (di).i[3] = ((from) & 0x000000ffULL); \ -} while (0) - -#define XFS_DI_HI(di) \ - (((__u32)(di).i[1] << 16) | ((di).i[2] << 8) | ((di).i[3])) -#define XFS_DI_LO(di) \ - (((__u32)(di).i[4] << 24) | ((di).i[5] << 16) | ((di).i[6] << 8) | ((di).i[7])) - -#define XFS_GET_DIR_INO8(di) \ - (((xfs_ino_t)XFS_DI_LO(di) & 0xffffffffULL) | \ - ((xfs_ino_t)XFS_DI_HI(di) << 32)) - -#define XFS_PUT_DIR_INO8(from, di) \ -do { \ - (di).i[0] = 0; \ - (di).i[1] = (((from) & 0x00ff000000000000ULL) >> 48); \ - (di).i[2] = (((from) & 0x0000ff0000000000ULL) >> 40); \ - (di).i[3] = (((from) & 0x000000ff00000000ULL) >> 32); \ - (di).i[4] = (((from) & 0x00000000ff000000ULL) >> 24); \ - (di).i[5] = (((from) & 0x0000000000ff0000ULL) >> 16); \ - (di).i[6] = (((from) & 0x000000000000ff00ULL) >> 8); \ - (di).i[7] = ((from) & 0x00000000000000ffULL); \ -} while (0) - -#endif /* __XFS_ARCH_H__ */ diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 1a210104327..bfe36fc2cdc 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -15,40 +15,34 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ - -#include <linux/capability.h> - #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" #include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_alloc.h" -#include "xfs_btree.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" #include "xfs_error.h" #include "xfs_quota.h" #include "xfs_trans_space.h" -#include "xfs_acl.h" -#include "xfs_rw.h" +#include "xfs_trace.h" +#include "xfs_dinode.h" /* * xfs_attr.c @@ -56,11 +50,6 @@ * Provide the external interfaces to manage attribute lists. */ -#define ATTR_SYSCOUNT 2 -STATIC struct attrnames posix_acl_access; -STATIC struct attrnames posix_acl_default; -STATIC struct attrnames *attr_system_names[ATTR_SYSCOUNT]; - /*======================================================================== * Function prototypes for the kernel. *========================================================================*/ @@ -76,7 +65,6 @@ STATIC int xfs_attr_shortform_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args); STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); -STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); /* * Internal routines when attribute list is more than one block. @@ -84,172 +72,177 @@ STATIC int xfs_attr_leaf_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_node_get(xfs_da_args_t *args); STATIC int xfs_attr_node_addname(xfs_da_args_t *args); STATIC int xfs_attr_node_removename(xfs_da_args_t *args); -STATIC int xfs_attr_node_list(xfs_attr_list_context_t *context); STATIC int xfs_attr_fillstate(xfs_da_state_t *state); STATIC int xfs_attr_refillstate(xfs_da_state_t *state); -/* - * Routines to manipulate out-of-line attribute values. - */ -STATIC int xfs_attr_rmtval_get(xfs_da_args_t *args); -STATIC int xfs_attr_rmtval_set(xfs_da_args_t *args); -STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); -#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ +STATIC int +xfs_attr_args_init( + struct xfs_da_args *args, + struct xfs_inode *dp, + const unsigned char *name, + int flags) +{ -#if defined(XFS_ATTR_TRACE) -ktrace_t *xfs_attr_trace_buf; -#endif + if (!name) + return EINVAL; + + memset(args, 0, sizeof(*args)); + args->geo = dp->i_mount->m_attr_geo; + args->whichfork = XFS_ATTR_FORK; + args->dp = dp; + args->flags = flags; + args->name = name; + args->namelen = strlen((const char *)name); + if (args->namelen >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + args->hashval = xfs_da_hashname(args->name, args->namelen); + return 0; +} + +int +xfs_inode_hasattr( + struct xfs_inode *ip) +{ + if (!XFS_IFORK_Q(ip) || + (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_anextents == 0)) + return 0; + return 1; +} /*======================================================================== * Overall external interface routines. *========================================================================*/ int -xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, - char *value, int *valuelenp, int flags, struct cred *cred) +xfs_attr_get( + struct xfs_inode *ip, + const unsigned char *name, + unsigned char *value, + int *valuelenp, + int flags) { - xfs_da_args_t args; - int error; + struct xfs_da_args args; + uint lock_mode; + int error; - if ((XFS_IFORK_Q(ip) == 0) || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) - return(ENOATTR); + XFS_STATS_INC(xs_attr_get); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EIO; + + if (!xfs_inode_hasattr(ip)) + return ENOATTR; + + error = xfs_attr_args_init(&args, ip, name, flags); + if (error) + return error; - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; args.value = value; args.valuelen = *valuelenp; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = ip; - args.whichfork = XFS_ATTR_FORK; - /* - * Decide on what work routines to call based on the inode size. - */ - if (XFS_IFORK_Q(ip) == 0 || - (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_anextents == 0)) { - error = XFS_ERROR(ENOATTR); - } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + lock_mode = xfs_ilock_attr_map_shared(ip); + if (!xfs_inode_hasattr(ip)) + error = ENOATTR; + else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) error = xfs_attr_shortform_getvalue(&args); - } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) { + else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) error = xfs_attr_leaf_get(&args); - } else { + else error = xfs_attr_node_get(&args); - } + xfs_iunlock(ip, lock_mode); + + *valuelenp = args.valuelen; + return error == EEXIST ? 0 : error; +} + +/* + * Calculate how many blocks we need for the new attribute, + */ +STATIC int +xfs_attr_calc_size( + struct xfs_da_args *args, + int *local) +{ + struct xfs_mount *mp = args->dp->i_mount; + int size; + int nblks; /* - * Return the number of bytes in the value to the caller. + * Determine space new attribute will use, and if it would be + * "local" or "remote" (note: local != inline). */ - *valuelenp = args.valuelen; + size = xfs_attr_leaf_newentsize(args, local); + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + if (*local) { + if (size > (args->geo->blksize / 2)) { + /* Double split possible */ + nblks *= 2; + } + } else { + /* + * Out of line attribute, cannot double split, but + * make room for the attribute value itself. + */ + uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen); + nblks += dblocks; + nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + } - if (error == EEXIST) - error = 0; - return(error); + return nblks; } int -xfs_attr_get(bhv_desc_t *bdp, const char *name, char *value, int *valuelenp, - int flags, struct cred *cred) +xfs_attr_set( + struct xfs_inode *dp, + const unsigned char *name, + unsigned char *value, + int valuelen, + int flags) { - xfs_inode_t *ip = XFS_BHVTOI(bdp); - int error, namelen; + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + struct xfs_trans_res tres; + xfs_fsblock_t firstblock; + int rsvd = (flags & ATTR_ROOT) != 0; + int error, err2, committed, local; - XFS_STATS_INC(xs_attr_get); - - if (!name) - return(EINVAL); - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return(EFAULT); /* match IRIX behaviour */ - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return(EIO); + XFS_STATS_INC(xs_attr_set); - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return(error); -} + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; -STATIC int -xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, - char *value, int valuelen, int flags) -{ - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error, err2, committed; - int local, size; - uint nblks; - xfs_mount_t *mp = dp->i_mount; - int rsvd = (flags & ATTR_ROOT) != 0; + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; - /* - * Attach the dquots to the inode. - */ - if ((error = XFS_QM_DQATTACH(mp, dp, 0))) - return (error); + args.value = value; + args.valuelen = valuelen; + args.firstblock = &firstblock; + args.flist = &flist; + args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + args.total = xfs_attr_calc_size(&args, &local); - /* - * Determine space new attribute will use, and if it would be - * "local" or "remote" (note: local != inline). - */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, - mp->m_sb.sb_blocksize, &local); + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; /* * If the inode doesn't have an attribute fork, add one. * (inode must not be locked when we call this routine) */ if (XFS_IFORK_Q(dp) == 0) { - if ((error = xfs_bmap_add_attrfork(dp, size, rsvd))) - return(error); - } + int sf_size = sizeof(xfs_attr_sf_hdr_t) + + XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen); - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; - args.value = value; - args.valuelen = valuelen; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; - args.firstblock = &firstblock; - args.flist = &flist; - args.whichfork = XFS_ATTR_FORK; - args.addname = 1; - args.oknoent = 1; - - nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); - if (local) { - if (size > (mp->m_sb.sb_blocksize >> 1)) { - /* Double split possible */ - nblks <<= 1; - } - } else { - uint dblocks = XFS_B_TO_FSB(mp, valuelen); - /* Out of line attribute, cannot double split, but make - * room for the attribute value itself. - */ - nblks += dblocks; - nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); + error = xfs_bmap_add_attrfork(dp, sf_size, rsvd); + if (error) + return error; } - /* Size is now blocks for attribute data */ - args.total = nblks; - /* * Start our first transaction of the day. * @@ -270,34 +263,35 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, if (rsvd) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, (uint) nblks, - XFS_ATTRSET_LOG_RES(mp, nblks), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRSET_LOG_COUNT))) { + tres.tr_logres = M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * args.total; + tres.tr_logcount = XFS_ATTRSET_LOG_COUNT; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(args.trans, &tres, args.total, 0); + if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0, - rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); + error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0, + rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); - return (error); + return error; } - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); /* * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ - if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) && - (dp->i_d.di_anextents == 0))) { + if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL || + (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && + dp->i_d.di_anextents == 0)) { /* * Build initial attribute list (if required). @@ -322,32 +316,29 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * the transaction goes to disk before returning * to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); + + if (!error && (flags & ATTR_KERNOTIME) == 0) { + xfs_trans_ichgtime(args.trans, dp, + XFS_ICHGTIME_CHG); } err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES, - NULL); + XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - /* - * Hit the inode change time. - */ - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_ichgtime(dp, XFS_ICHGTIME_CHG); - } - return(error == 0 ? err2 : error); + return error ? error : err2; } /* * It won't fit in the shortform, transform to a leaf block. * GROT: another possible req'mt for a double-split btree op. */ - XFS_BMAP_INIT(args.flist, args.firstblock); + xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); if (!error) { error = xfs_bmap_finish(&args.trans, args.flist, - *args.firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -360,114 +351,96 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); - } + if (committed) + xfs_trans_ijoin(args.trans, dp, 0); /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ - if ((error = xfs_attr_rolltrans(&args.trans, dp))) + + error = xfs_trans_roll(&args.trans, dp); + if (error) goto out; } - if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) error = xfs_attr_leaf_addname(&args); - } else { + else error = xfs_attr_node_addname(&args); - } - if (error) { + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, - NULL); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - /* - * Hit the inode change time. - */ - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_ichgtime(dp, XFS_ICHGTIME_CHG); - } - - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + } xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } +/* + * Generic handler routine to remove a name from an attribute list. + * Transitions attribute list from Btree to shortform as necessary. + */ int -xfs_attr_set(bhv_desc_t *bdp, const char *name, char *value, int valuelen, int flags, - struct cred *cred) +xfs_attr_remove( + struct xfs_inode *dp, + const unsigned char *name, + int flags) { - xfs_inode_t *dp; - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + struct xfs_mount *mp = dp->i_mount; + struct xfs_da_args args; + struct xfs_bmap_free flist; + xfs_fsblock_t firstblock; + int error; - XFS_STATS_INC(xs_attr_set); + XFS_STATS_INC(xs_attr_remove); - dp = XFS_BHVTOI(bdp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); + return EIO; - return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); -} + if (!xfs_inode_hasattr(dp)) + return ENOATTR; -/* - * Generic handler routine to remove a name from an attribute list. - * Transitions attribute list from Btree to shortform as necessary. - */ -STATIC int -xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) -{ - xfs_da_args_t args; - xfs_fsblock_t firstblock; - xfs_bmap_free_t flist; - int error; - xfs_mount_t *mp = dp->i_mount; + error = xfs_attr_args_init(&args, dp, name, flags); + if (error) + return error; - /* - * Fill in the arg structure for this request. - */ - memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; - args.flags = flags; - args.hashval = xfs_da_hashname(args.name, args.namelen); - args.dp = dp; args.firstblock = &firstblock; args.flist = &flist; - args.total = 0; - args.whichfork = XFS_ATTR_FORK; /* - * Attach the dquots to the inode. + * we have no control over the attribute names that userspace passes us + * to remove, so we have to allow the name lookup prior to attribute + * removal to fail. */ - if ((error = XFS_QM_DQATTACH(mp, dp, 0))) - return (error); + args.op_flags = XFS_DA_OP_OKNOENT; + + error = xfs_qm_dqattach(dp, 0); + if (error) + return error; /* * Start our first transaction of the day. @@ -489,13 +462,11 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) if (flags & ATTR_ROOT) args.trans->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(args.trans, - XFS_ATTRRM_SPACE_RES(mp), - XFS_ATTRRM_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ATTRRM_LOG_COUNT))) { + error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, + XFS_ATTRRM_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(args.trans, 0); - return(error); + return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -503,276 +474,50 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) * No need to make quota reservations here. We expect to release some * blocks not allocate in the common case. */ - xfs_trans_ijoin(args.trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args.trans, dp); + xfs_trans_ijoin(args.trans, dp, 0); - /* - * Decide on what work routines to call based on the inode size. - */ - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { + if (!xfs_inode_hasattr(dp)) { error = XFS_ERROR(ENOATTR); - goto out; - } - if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); error = xfs_attr_shortform_remove(&args); - if (error) { - goto out; - } } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { error = xfs_attr_leaf_removename(&args); } else { error = xfs_attr_node_removename(&args); } - if (error) { + + if (error) goto out; - } /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { + if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(args.trans); - } + + if ((flags & ATTR_KERNOTIME) == 0) + xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); /* * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES, - NULL); + error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(dp, XFS_ILOCK_EXCL); - /* - * Hit the inode change time. - */ - if (!error && (flags & ATTR_KERNOTIME) == 0) { - xfs_ichgtime(dp, XFS_ICHGTIME_CHG); - } - - return(error); + return error; out: - if (args.trans) + if (args.trans) { xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); -} - -int -xfs_attr_remove(bhv_desc_t *bdp, const char *name, int flags, struct cred *cred) -{ - xfs_inode_t *dp; - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ - - XFS_STATS_INC(xs_attr_remove); - - dp = XFS_BHVTOI(bdp); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(XFS_ERROR(ENOATTR)); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - return xfs_attr_remove_int(dp, name, namelen, flags); -} - -/* - * Generate a list of extended attribute names and optionally - * also value lengths. Positive return value follows the XFS - * convention of being an error, zero or negative return code - * is the length of the buffer returned (negated), indicating - * success. - */ -int -xfs_attr_list(bhv_desc_t *bdp, char *buffer, int bufsize, int flags, - attrlist_cursor_kern_t *cursor, struct cred *cred) -{ - xfs_attr_list_context_t context; - xfs_inode_t *dp; - int error; - - XFS_STATS_INC(xs_attr_list); - - /* - * Validate the cursor. - */ - if (cursor->pad1 || cursor->pad2) - return(XFS_ERROR(EINVAL)); - if ((cursor->initted == 0) && - (cursor->hashval || cursor->blkno || cursor->offset)) - return(XFS_ERROR(EINVAL)); - - /* - * Check for a properly aligned buffer. - */ - if (((long)buffer) & (sizeof(int)-1)) - return(XFS_ERROR(EFAULT)); - if (flags & ATTR_KERNOVAL) - bufsize = 0; - - /* - * Initialize the output buffer. - */ - context.dp = dp = XFS_BHVTOI(bdp); - context.cursor = cursor; - context.count = 0; - context.dupcnt = 0; - context.resynch = 1; - context.flags = flags; - if (!(flags & ATTR_KERNAMELS)) { - context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ - context.firstu = context.bufsize; - context.alist = (attrlist_t *)buffer; - context.alist->al_count = 0; - context.alist->al_more = 0; - context.alist->al_offset[0] = context.bufsize; - } - else { - context.bufsize = bufsize; - context.firstu = context.bufsize; - context.alist = (attrlist_t *)buffer; - } - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return (EIO); - - xfs_ilock(dp, XFS_ILOCK_SHARED); - /* - * Decide on what work routines to call based on the inode size. - */ - xfs_attr_trace_l_c("syscall start", &context); - if (XFS_IFORK_Q(dp) == 0 || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - error = 0; - } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = xfs_attr_shortform_list(&context); - } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { - error = xfs_attr_leaf_list(&context); - } else { - error = xfs_attr_node_list(&context); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - xfs_attr_trace_l_c("syscall end", &context); - - if (!(context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS))) { - ASSERT(error >= 0); - } - else { /* must return negated buffer size or the error */ - if (context.count < 0) - error = XFS_ERROR(ERANGE); - else - error = -context.count; - } - - return(error); -} - -int /* error */ -xfs_attr_inactive(xfs_inode_t *dp) -{ - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; - - mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - - xfs_ilock(dp, XFS_ILOCK_SHARED); - if ((XFS_IFORK_Q(dp) == 0) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return(0); - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); - - /* - * Start our first transaction of the day. - * - * All future transactions during this code must be "chained" off - * this one via the trans_dup() call. All transactions will contain - * the inode, and the inode will always be marked with trans_ihold(). - * Since the inode will be locked in all transactions, we must log - * the inode in every transaction to let it float upward through - * the log. - */ - trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); - if ((error = xfs_trans_reserve(trans, 0, XFS_ATTRINVAL_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ATTRINVAL_LOG_COUNT))) { - xfs_trans_cancel(trans, 0); - return(error); } - xfs_ilock(dp, XFS_ILOCK_EXCL); - - /* - * No need to make quota reservations here. We expect to release some - * blocks, not allocate, in the common case. - */ - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); - - /* - * Decide on what work routines to call based on the inode size. - */ - if ((XFS_IFORK_Q(dp) == 0) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || - (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && - dp->i_d.di_anextents == 0)) { - error = 0; - goto out; - } - error = xfs_attr_root_inactive(&trans, dp); - if (error) - goto out; - /* - * signal synchronous inactive transactions unless this - * is a synchronous mount filesystem in which case we - * know that we're here because we've been called out of - * xfs_inactive which means that the last reference is gone - * and the unlink transaction has already hit the disk so - * async inactive transactions are safe. - */ - if ((error = xfs_itruncate_finish(&trans, dp, 0LL, XFS_ATTR_FORK, - (!(mp->m_flags & XFS_MOUNT_WSYNC) - ? 1 : 0)))) - goto out; - - /* - * Commit the last in the sequence of transactions. - */ - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES, - NULL); xfs_iunlock(dp, XFS_ILOCK_EXCL); - - return(error); - -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); - return(error); + return error; } - - /*======================================================================== * External routines when attribute list is inside the inode *========================================================================*/ @@ -786,6 +531,8 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) { int newsize, forkoff, retval; + trace_xfs_attr_sf_addname(args); + retval = xfs_attr_shortform_lookup(args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { return(retval); @@ -822,61 +569,74 @@ xfs_attr_shortform_addname(xfs_da_args_t *args) * This leaf block cannot have a "remote" value, we only call this routine * if bmap_one_block() says there is only one block (ie: no remote blks). */ -int +STATIC int xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; + trace_xfs_attr_leaf_addname(args); + /* * Read the (only) block in the attribute list in. */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; /* * Look up the given attribute in the leaf block. Figure out if * the given flags produce an error or call for an atomic rename. */ - retval = xfs_attr_leaf_lookup_int(bp, args); + retval = xfs_attr3_leaf_lookup_int(bp, args); if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { - xfs_da_brelse(args->trans, bp); - return(retval); + xfs_trans_brelse(args->trans, bp); + return retval; } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) { /* pure create op */ - xfs_da_brelse(args->trans, bp); - return(retval); + xfs_trans_brelse(args->trans, bp); + return retval; } - args->rename = 1; /* an atomic rename */ + + trace_xfs_attr_leaf_replace(args); + + /* save the attribute state for later removal*/ + args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* * Add the attribute to the leaf block, transitioning to a Btree * if required. */ - retval = xfs_attr_leaf_add(bp, args); - xfs_da_buf_done(bp); + retval = xfs_attr3_leaf_add(bp, args); if (retval == ENOSPC) { /* * Promote the attribute list to the Btree format, then * Commit that transaction so that the node_addname() call * can manage its own transactions. */ - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -889,16 +649,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the current trans (including the inode) and start * a new one. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); /* @@ -912,7 +671,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) return (error); /* @@ -933,12 +693,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * so that one disappears and one appears atomically. Then we * must remove the "old" attribute/value pair. */ - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { /* * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) return(error); @@ -950,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -960,24 +721,23 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Read in the block containing the "old" attr, then * remove the "old" attr from that block (neat, huh!) */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, - &bp, XFS_ATTR_FORK); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, + -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); - (void)xfs_attr_leaf_remove(bp, args); + return error; + + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -992,25 +752,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } - } else - xfs_da_buf_done(bp); + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } /* * Commit the remove and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, dp); + error = xfs_trans_roll(&args->trans, dp); } else if (args->rmtblkno > 0) { /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); } - return(error); + return error; } /* @@ -1023,58 +780,54 @@ STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error, committed, forkoff; + trace_xfs_attr_leaf_removename(args); + /* * Remove the attribute. */ dp = args->dp; args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { - return(error); - } + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) + return error; - ASSERT(bp != NULL); - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error == ENOATTR) { - xfs_da_brelse(args->trans, bp); - return(error); + xfs_trans_brelse(args->trans, bp); + return error; } - (void)xfs_attr_leaf_remove(bp, args); + xfs_attr3_leaf_remove(bp, args); /* * If the result is small enough, shrink it all into the inode. */ if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); - return(error); + return error; } /* * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } - } else - xfs_da_buf_done(bp); - return(0); + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + } + return 0; } /* @@ -1086,60 +839,31 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) STATIC int xfs_attr_leaf_get(xfs_da_args_t *args) { - xfs_dabuf_t *bp; + struct xfs_buf *bp; int error; + trace_xfs_attr_leaf_get(args); + args->blkno = 0; - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); if (error) - return(error); - ASSERT(bp != NULL); + return error; - error = xfs_attr_leaf_lookup_int(bp, args); + error = xfs_attr3_leaf_lookup_int(bp, args); if (error != EEXIST) { - xfs_da_brelse(args->trans, bp); - return(error); + xfs_trans_brelse(args->trans, bp); + return error; } - error = xfs_attr_leaf_getvalue(bp, args); - xfs_da_brelse(args->trans, bp); + error = xfs_attr3_leaf_getvalue(bp, args); + xfs_trans_brelse(args->trans, bp); if (!error && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { error = xfs_attr_rmtval_get(args); } - return(error); -} - -/* - * Copy out attribute entries for attr_list(), for leaf attribute lists. - */ -STATIC int -xfs_attr_leaf_list(xfs_attr_list_context_t *context) -{ - xfs_attr_leafblock_t *leaf; - int error; - xfs_dabuf_t *bp; - - context->cursor->blkno = 0; - error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); - if (error) - return(error); - ASSERT(bp != NULL); - leaf = bp->data; - if (unlikely(be16_to_cpu(leaf->hdr.info.magic) != XFS_ATTR_LEAF_MAGIC)) { - XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } - - (void)xfs_attr_leaf_list_int(bp, context); - xfs_da_brelse(NULL, bp); - return(0); + return error; } - /*======================================================================== - * External routines when attribute list size > XFS_LBSIZE(mp). + * External routines when attribute list size > geo->blksize *========================================================================*/ /* @@ -1161,6 +885,8 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_mount_t *mp; int committed, retval, error; + trace_xfs_attr_node_addname(args); + /* * Fill in bucket of arguments/results/context to carry around. */ @@ -1170,14 +896,12 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name already exists, and get back a pointer * to where it should go. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; blk = &state->path.blk[ state->path.active-1 ]; @@ -1187,16 +911,28 @@ restart: } else if (retval == EEXIST) { if (args->flags & ATTR_CREATE) goto out; - args->rename = 1; /* atomic rename op */ + + trace_xfs_attr_node_replace(args); + + /* save the attribute state for later removal*/ + args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } - retval = xfs_attr_leaf_add(blk->bp, state->args); + retval = xfs_attr3_leaf_add(blk->bp, state->args); if (retval == ENOSPC) { if (state->path.active == 1) { /* @@ -1205,12 +941,12 @@ restart: * have been a b-tree. */ xfs_da_state_free(state); - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_attr_leaf_to_node(args); + state = NULL; + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_node(args); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1225,16 +961,15 @@ restart: * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the node conversion and start the next * trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; goto restart; @@ -1246,11 +981,11 @@ restart: * in the index/blkno/rmtblkno/rmtblkcnt fields and * in the index2/blkno2/rmtblkno2/rmtblkcnt2 fields. */ - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_da_split(state); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_split(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1263,15 +998,13 @@ restart: * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); } /* @@ -1285,7 +1018,8 @@ restart: * Commit the leaf addition or btree split and start the next * trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; /* @@ -1306,12 +1040,12 @@ restart: * so that one disappears and one appears atomically. Then we * must remove the "old" attribute/value pair. */ - if (args->rename) { + if (args->op_flags & XFS_DA_OP_RENAME) { /* * In a separate transaction, set the incomplete flag on the * "old" attr and clear the incomplete flag on the "new" attr. */ - error = xfs_attr_leaf_flipflags(args); + error = xfs_attr3_leaf_flipflags(args); if (error) goto out; @@ -1323,6 +1057,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -1338,10 +1073,8 @@ restart: state = xfs_da_state_alloc(); state->args = args; state->mp = mp; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; state->inleaf = 0; - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) goto out; @@ -1350,19 +1083,18 @@ restart: */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - error = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + error = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_da_join(state); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1377,23 +1109,22 @@ restart: * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); } /* * Commit and start the next trans in the chain. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; } else if (args->rmtblkno > 0) { /* * Added a "remote" value, just clear the incomplete flag. */ - error = xfs_attr_leaf_clearflag(args); + error = xfs_attr3_leaf_clearflag(args); if (error) goto out; } @@ -1420,9 +1151,11 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_t *state; xfs_da_state_blk_t *blk; xfs_inode_t *dp; - xfs_dabuf_t *bp; + struct xfs_buf *bp; int retval, error, committed, forkoff; + trace_xfs_attr_node_removename(args); + /* * Tie a string around our finger to remind us where we are. */ @@ -1430,13 +1163,11 @@ xfs_attr_node_removename(xfs_da_args_t *args) state = xfs_da_state_alloc(); state->args = args; state->mp = dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error || (retval != EEXIST)) { if (error == 0) error = retval; @@ -1465,7 +1196,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) * Mark the attribute as INCOMPLETE, then bunmapi() the * remote value. */ - error = xfs_attr_leaf_setflag(args); + error = xfs_attr3_leaf_setflag(args); if (error) goto out; error = xfs_attr_rmtval_remove(args); @@ -1486,18 +1217,18 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ blk = &state->path.blk[ state->path.active-1 ]; ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); - retval = xfs_attr_leaf_remove(blk->bp, args); - xfs_da_fixhashpath(state, &state->path); + retval = xfs_attr3_leaf_remove(blk->bp, args); + xfs_da3_fixhashpath(state, &state->path); /* * Check to see if the tree needs to be collapsed. */ if (retval && (state->path.active > 1)) { - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_da_join(state); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_da3_join(state); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); + &committed); } if (error) { ASSERT(committed); @@ -1510,15 +1241,14 @@ xfs_attr_node_removename(xfs_da_args_t *args) * bmap_finish() may have committed the last trans and started * a new one. We need the inode to be in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); /* * Commit the Btree join operation and start a new trans. */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) + error = xfs_trans_roll(&args->trans, dp); + if (error) goto out; } @@ -1531,25 +1261,19 @@ xfs_attr_node_removename(xfs_da_args_t *args) */ ASSERT(state->path.active == 1); ASSERT(state->path.blk[0].bp); - xfs_da_buf_done(state->path.blk[0].bp); state->path.blk[0].bp = NULL; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, - XFS_ATTR_FORK); + error = xfs_attr3_leaf_read(args->trans, args->dp, 0, -1, &bp); if (error) goto out; - ASSERT(be16_to_cpu(((xfs_attr_leafblock_t *) - bp->data)->hdr.info.magic) - == XFS_ATTR_LEAF_MAGIC); if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_attr_leaf_to_shortform(bp, args, forkoff); + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); } if (error) { @@ -1564,12 +1288,10 @@ xfs_attr_node_removename(xfs_da_args_t *args) * and started a new one. We need the inode to be * in all transactions. */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); } else - xfs_da_brelse(args->trans, bp); + xfs_trans_brelse(args->trans, bp); } error = 0; @@ -1591,6 +1313,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level; + trace_xfs_attr_fillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1599,8 +1323,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1615,8 +1338,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = xfs_da_blkno(blk->bp); - xfs_da_buf_done(blk->bp); + blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1639,6 +1361,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) xfs_da_state_blk_t *blk; int level, error; + trace_xfs_attr_refillstate(state->args); + /* * Roll down the "path" in the state structure, storing the on-disk * block number for those buffers in the "path". @@ -1647,7 +1371,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1666,7 +1390,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->disk_blkno) { - error = xfs_da_read_buf(state->args->trans, + error = xfs_da3_node_read(state->args->trans, state->args->dp, blk->blkno, blk->disk_blkno, &blk->bp, XFS_ATTR_FORK); @@ -1695,16 +1419,16 @@ xfs_attr_node_get(xfs_da_args_t *args) int error, retval; int i; + trace_xfs_attr_node_get(args); + state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_sb.sb_blocksize; - state->node_ents = state->mp->m_attr_node_ents; /* * Search to see if name exists, and get back a pointer to it. */ - error = xfs_da_node_lookup_int(state, &retval); + error = xfs_da3_node_lookup_int(state, &retval); if (error) { retval = error; } else if (retval == EEXIST) { @@ -1715,7 +1439,7 @@ xfs_attr_node_get(xfs_da_args_t *args) /* * Get the value, local or "remote" */ - retval = xfs_attr_leaf_getvalue(blk->bp, args); + retval = xfs_attr3_leaf_getvalue(blk->bp, args); if (!retval && (args->rmtblkno > 0) && !(args->flags & ATTR_KERNOVAL)) { retval = xfs_attr_rmtval_get(args); @@ -1726,907 +1450,10 @@ xfs_attr_node_get(xfs_da_args_t *args) * If not in a transaction, we have to release all the buffers. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } xfs_da_state_free(state); return(retval); } - -STATIC int /* error */ -xfs_attr_node_list(xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int error, i; - xfs_dabuf_t *bp; - - cursor = context->cursor; - cursor->initted = 1; - - /* - * Do all sorts of validation on the passed-in cursor structure. - * If anything is amiss, ignore the cursor and look up the hashval - * starting from the btree root. - */ - bp = NULL; - if (cursor->blkno > 0) { - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); - if ((error != 0) && (error != EFSCORRUPTED)) - return(error); - if (bp) { - node = bp->data; - switch (be16_to_cpu(node->hdr.info.magic)) { - case XFS_DA_NODE_MAGIC: - xfs_attr_trace_l_cn("wrong blk", context, node); - xfs_da_brelse(NULL, bp); - bp = NULL; - break; - case XFS_ATTR_LEAF_MAGIC: - leaf = bp->data; - if (cursor->hashval > be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)) { - xfs_attr_trace_l_cl("wrong blk", - context, leaf); - xfs_da_brelse(NULL, bp); - bp = NULL; - } else if (cursor->hashval <= - be32_to_cpu(leaf->entries[0].hashval)) { - xfs_attr_trace_l_cl("maybe wrong blk", - context, leaf); - xfs_da_brelse(NULL, bp); - bp = NULL; - } - break; - default: - xfs_attr_trace_l_c("wrong blk - ??", context); - xfs_da_brelse(NULL, bp); - bp = NULL; - } - } - } - - /* - * We did not find what we expected given the cursor's contents, - * so we start from the top and work down based on the hash value. - * Note that start of node block is same as start of leaf block. - */ - if (bp == NULL) { - cursor->blkno = 0; - for (;;) { - error = xfs_da_read_buf(NULL, context->dp, - cursor->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) - return(error); - if (unlikely(bp == NULL)) { - XFS_ERROR_REPORT("xfs_attr_node_list(2)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } - node = bp->data; - if (be16_to_cpu(node->hdr.info.magic) - == XFS_ATTR_LEAF_MAGIC) - break; - if (unlikely(be16_to_cpu(node->hdr.info.magic) - != XFS_DA_NODE_MAGIC)) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, - node); - xfs_da_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } - btree = node->btree; - for (i = 0; i < be16_to_cpu(node->hdr.count); - btree++, i++) { - if (cursor->hashval - <= be32_to_cpu(btree->hashval)) { - cursor->blkno = be32_to_cpu(btree->before); - xfs_attr_trace_l_cb("descending", - context, btree); - break; - } - } - if (i == be16_to_cpu(node->hdr.count)) { - xfs_da_brelse(NULL, bp); - return(0); - } - xfs_da_brelse(NULL, bp); - } - } - ASSERT(bp != NULL); - - /* - * Roll upward through the blocks, processing each leaf block in - * order. As long as there is space in the result buffer, keep - * adding the information. - */ - for (;;) { - leaf = bp->data; - if (unlikely(be16_to_cpu(leaf->hdr.info.magic) - != XFS_ATTR_LEAF_MAGIC)) { - XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, leaf); - xfs_da_brelse(NULL, bp); - return(XFS_ERROR(EFSCORRUPTED)); - } - error = xfs_attr_leaf_list_int(bp, context); - if (error || !leaf->hdr.info.forw) - break; /* not really an error, buffer full or EOF */ - cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); - xfs_da_brelse(NULL, bp); - error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, - &bp, XFS_ATTR_FORK); - if (error) - return(error); - if (unlikely((bp == NULL))) { - XFS_ERROR_REPORT("xfs_attr_node_list(5)", - XFS_ERRLEVEL_LOW, - context->dp->i_mount); - return(XFS_ERROR(EFSCORRUPTED)); - } - } - xfs_da_brelse(NULL, bp); - return(0); -} - - -/*======================================================================== - * External routines for manipulating out-of-line attribute values. - *========================================================================*/ - -/* - * Read the value associated with an attribute from the out-of-line buffer - * that we stored it in. - */ -STATIC int -xfs_attr_rmtval_get(xfs_da_args_t *args) -{ - xfs_bmbt_irec_t map[ATTR_RMTVALUE_MAPSIZE]; - xfs_mount_t *mp; - xfs_daddr_t dblkno; - xfs_caddr_t dst; - xfs_buf_t *bp; - int nmap, error, tmp, valuelen, blkcnt, i; - xfs_dablk_t lblkno; - - ASSERT(!(args->flags & ATTR_KERNOVAL)); - - mp = args->dp->i_mount; - dst = args->value; - valuelen = args->valuelen; - lblkno = args->rmtblkno; - while (valuelen > 0) { - nmap = ATTR_RMTVALUE_MAPSIZE; - error = xfs_bmapi(args->trans, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, map, &nmap, NULL, NULL); - if (error) - return(error); - ASSERT(nmap >= 1); - - for (i = 0; (i < nmap) && (valuelen > 0); i++) { - ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && - (map[i].br_startblock != HOLESTARTBLOCK)); - dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK, &bp); - if (error) - return(error); - - tmp = (valuelen < XFS_BUF_SIZE(bp)) - ? valuelen : XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, dst, XFS_B_READ); - xfs_buf_relse(bp); - dst += tmp; - valuelen -= tmp; - - lblkno += map[i].br_blockcount; - } - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Write the value associated with an attribute into the out-of-line buffer - * that we have defined for it. - */ -STATIC int -xfs_attr_rmtval_set(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_fileoff_t lfileoff; - xfs_inode_t *dp; - xfs_bmbt_irec_t map; - xfs_daddr_t dblkno; - xfs_caddr_t src; - xfs_buf_t *bp; - xfs_dablk_t lblkno; - int blkcnt, valuelen, nmap, error, tmp, committed; - - dp = args->dp; - mp = dp->i_mount; - src = args->value; - - /* - * Find a "hole" in the attribute address space large enough for - * us to drop the new attribute's value into. - */ - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - lfileoff = 0; - error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, - XFS_ATTR_FORK); - if (error) { - return(error); - } - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; - args->rmtblkcnt = blkcnt; - - /* - * Roll through the "value", allocating blocks on disk as required. - */ - while (blkcnt > 0) { - /* - * Allocate a single extent, up to the size of the value. - */ - XFS_BMAP_INIT(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA | - XFS_BMAPI_WRITE, - args->firstblock, args->total, &map, &nmap, - args->flist, NULL); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) { - xfs_trans_ijoin(args->trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, dp); - } - - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - lblkno += map.br_blockcount; - blkcnt -= map.br_blockcount; - - /* - * Start the next trans in the chain. - */ - if ((error = xfs_attr_rolltrans(&args->trans, dp))) - return (error); - } - - /* - * Roll through the "value", copying the attribute value to the - * already-allocated blocks. Blocks are written synchronously - * so that we can know they are all on disk before we turn off - * the INCOMPLETE flag. - */ - lblkno = args->rmtblkno; - valuelen = args->valuelen; - while (valuelen > 0) { - /* - * Try to remember where we decided to put the value. - */ - XFS_BMAP_INIT(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi(NULL, dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - NULL, NULL); - if (error) { - return(error); - } - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - - tmp = (valuelen < XFS_BUF_SIZE(bp)) ? valuelen : - XFS_BUF_SIZE(bp); - xfs_biomove(bp, 0, tmp, src, XFS_B_WRITE); - if (tmp < XFS_BUF_SIZE(bp)) - xfs_biozero(bp, tmp, XFS_BUF_SIZE(bp) - tmp); - if ((error = xfs_bwrite(mp, bp))) {/* GROT: NOTE: synchronous write */ - return (error); - } - src += tmp; - valuelen -= tmp; - - lblkno += map.br_blockcount; - } - ASSERT(valuelen == 0); - return(0); -} - -/* - * Remove the value associated with an attribute by deleting the - * out-of-line buffer that it is stored on. - */ -STATIC int -xfs_attr_rmtval_remove(xfs_da_args_t *args) -{ - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; - - mp = args->dp->i_mount; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { - /* - * Try to remember where we decided to put the value. - */ - XFS_BMAP_INIT(args->flist, args->firstblock); - nmap = 1; - error = xfs_bmapi(NULL, args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, 0, &map, &nmap, - args->flist, NULL); - if (error) { - return(error); - } - ASSERT(nmap == 1); - ASSERT((map.br_startblock != DELAYSTARTBLOCK) && - (map.br_startblock != HOLESTARTBLOCK)); - - dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - - /* - * If the "remote" value is in the cache, remove it. - */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, - XFS_INCORE_TRYLOCK); - if (bp) { - XFS_BUF_STALE(bp); - XFS_BUF_UNDELAYWRITE(bp); - xfs_buf_relse(bp); - bp = NULL; - } - - valuelen -= map.br_blockcount; - - lblkno += map.br_blockcount; - } - - /* - * Keep de-allocating extents until the remote-value region is gone. - */ - lblkno = args->rmtblkno; - blkcnt = args->rmtblkcnt; - done = 0; - while (!done) { - XFS_BMAP_INIT(args->flist, args->firstblock); - error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - 1, args->firstblock, args->flist, - NULL, &done); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - *args->firstblock, &committed); - } - if (error) { - ASSERT(committed); - args->trans = NULL; - xfs_bmap_cancel(args->flist); - return(error); - } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) { - xfs_trans_ijoin(args->trans, args->dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(args->trans, args->dp); - } - - /* - * Close out trans and start the next one in the chain. - */ - if ((error = xfs_attr_rolltrans(&args->trans, args->dp))) - return (error); - } - return(0); -} - -#if defined(XFS_ATTR_TRACE) -/* - * Add a trace buffer entry for an attr_list context structure. - */ -void -xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, - (__psunsigned_t)NULL, - (__psunsigned_t)NULL, - (__psunsigned_t)NULL); -} - -/* - * Add a trace buffer entry for a context structure and a Btree node. - */ -void -xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, - struct xfs_da_intnode *node) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, - (__psunsigned_t)be16_to_cpu(node->hdr.count), - (__psunsigned_t)be32_to_cpu(node->btree[0].hashval), - (__psunsigned_t)be32_to_cpu(node->btree[ - be16_to_cpu(node->hdr.count)-1].hashval)); -} - -/* - * Add a trace buffer entry for a context structure and a Btree element. - */ -void -xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, - struct xfs_da_node_entry *btree) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, - (__psunsigned_t)be32_to_cpu(btree->hashval), - (__psunsigned_t)be32_to_cpu(btree->before), - (__psunsigned_t)NULL); -} - -/* - * Add a trace buffer entry for a context structure and a leaf block. - */ -void -xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, - struct xfs_attr_leafblock *leaf) -{ - xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, - (__psunsigned_t)context->dp, - (__psunsigned_t)context->cursor->hashval, - (__psunsigned_t)context->cursor->blkno, - (__psunsigned_t)context->cursor->offset, - (__psunsigned_t)context->alist, - (__psunsigned_t)context->bufsize, - (__psunsigned_t)context->count, - (__psunsigned_t)context->firstu, - (__psunsigned_t) - ((context->count > 0) && - !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL))) - ? (ATTR_ENTRY(context->alist, - context->count-1)->a_valuelen) - : 0, - (__psunsigned_t)context->dupcnt, - (__psunsigned_t)context->flags, - (__psunsigned_t)be16_to_cpu(leaf->hdr.count), - (__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval), - (__psunsigned_t)be32_to_cpu(leaf->entries[ - be16_to_cpu(leaf->hdr.count)-1].hashval)); -} - -/* - * Add a trace buffer entry for the arguments given to the routine, - * generic form. - */ -void -xfs_attr_trace_enter(int type, char *where, - __psunsigned_t a2, __psunsigned_t a3, - __psunsigned_t a4, __psunsigned_t a5, - __psunsigned_t a6, __psunsigned_t a7, - __psunsigned_t a8, __psunsigned_t a9, - __psunsigned_t a10, __psunsigned_t a11, - __psunsigned_t a12, __psunsigned_t a13, - __psunsigned_t a14, __psunsigned_t a15) -{ - ASSERT(xfs_attr_trace_buf); - ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type), - (void *)where, - (void *)a2, (void *)a3, (void *)a4, - (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10, - (void *)a11, (void *)a12, (void *)a13, - (void *)a14, (void *)a15); -} -#endif /* XFS_ATTR_TRACE */ - - -/*======================================================================== - * System (pseudo) namespace attribute interface routines. - *========================================================================*/ - -STATIC int -posix_acl_access_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS); -} - -STATIC int -posix_acl_access_exists( - bhv_vnode_t *vp) -{ - return xfs_acl_vhasacl_access(vp); -} - -STATIC int -posix_acl_default_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT); -} - -STATIC int -posix_acl_default_exists( - bhv_vnode_t *vp) -{ - return xfs_acl_vhasacl_default(vp); -} - -STATIC struct attrnames posix_acl_access = { - .attr_name = "posix_acl_access", - .attr_namelen = sizeof("posix_acl_access") - 1, - .attr_get = posix_acl_access_get, - .attr_set = posix_acl_access_set, - .attr_remove = posix_acl_access_remove, - .attr_exists = posix_acl_access_exists, -}; - -STATIC struct attrnames posix_acl_default = { - .attr_name = "posix_acl_default", - .attr_namelen = sizeof("posix_acl_default") - 1, - .attr_get = posix_acl_default_get, - .attr_set = posix_acl_default_set, - .attr_remove = posix_acl_default_remove, - .attr_exists = posix_acl_default_exists, -}; - -STATIC struct attrnames *attr_system_names[] = - { &posix_acl_access, &posix_acl_default }; - - -/*======================================================================== - * Namespace-prefix-style attribute name interface routines. - *========================================================================*/ - -STATIC int -attr_generic_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - return -bhv_vop_attr_set(vp, name, data, size, xflags, NULL); -} - -STATIC int -attr_generic_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - int error, asize = size; - - error = bhv_vop_attr_get(vp, name, data, &asize, xflags, NULL); - if (!error) - return asize; - return -error; -} - -STATIC int -attr_generic_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - return -bhv_vop_attr_remove(vp, name, xflags, NULL); -} - -STATIC int -attr_generic_listadd( - attrnames_t *prefix, - attrnames_t *namesp, - void *data, - size_t size, - ssize_t *result) -{ - char *p = data + *result; - - *result += prefix->attr_namelen; - *result += namesp->attr_namelen + 1; - if (!size) - return 0; - if (*result > size) - return -ERANGE; - strcpy(p, prefix->attr_name); - p += prefix->attr_namelen; - strcpy(p, namesp->attr_name); - p += namesp->attr_namelen + 1; - return 0; -} - -STATIC int -attr_system_list( - bhv_vnode_t *vp, - void *data, - size_t size, - ssize_t *result) -{ - attrnames_t *namesp; - int i, error = 0; - - for (i = 0; i < ATTR_SYSCOUNT; i++) { - namesp = attr_system_names[i]; - if (!namesp->attr_exists || !namesp->attr_exists(vp)) - continue; - error = attr_generic_listadd(&attr_system, namesp, - data, size, result); - if (error) - break; - } - return error; -} - -int -attr_generic_list( - bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result) -{ - attrlist_cursor_kern_t cursor = { 0 }; - int error; - - error = bhv_vop_attr_list(vp, data, size, xflags, &cursor, NULL); - if (error > 0) - return -error; - *result = -error; - return attr_system_list(vp, data, size, result); -} - -attrnames_t * -attr_lookup_namespace( - char *name, - struct attrnames **names, - int nnames) -{ - int i; - - for (i = 0; i < nnames; i++) - if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen)) - return names[i]; - return NULL; -} - -/* - * Some checks to prevent people abusing EAs to get over quota: - * - Don't allow modifying user EAs on devices/symlinks; - * - Don't allow modifying user EAs if sticky bit set; - */ -STATIC int -attr_user_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - struct inode *inode = vn_to_inode(vp); - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER)) - return -EPERM; - return 0; -} - -STATIC int -attr_trusted_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - struct inode *inode = vn_to_inode(vp); - - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - return 0; -} - -STATIC int -attr_secure_capable( - bhv_vnode_t *vp, - cred_t *cred) -{ - return -ENOSECURITY; -} - -STATIC int -attr_system_set( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - attrnames_t *namesp; - int error; - - if (xflags & ATTR_CREATE) - return -EINVAL; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - error = namesp->attr_set(vp, name, data, size, xflags); - if (!error) - error = vn_revalidate(vp); - return error; -} - -STATIC int -attr_system_get( - bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) -{ - attrnames_t *namesp; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - return namesp->attr_get(vp, name, data, size, xflags); -} - -STATIC int -attr_system_remove( - bhv_vnode_t *vp, char *name, int xflags) -{ - attrnames_t *namesp; - - namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT); - if (!namesp) - return -EOPNOTSUPP; - return namesp->attr_remove(vp, name, xflags); -} - -struct attrnames attr_system = { - .attr_name = "system.", - .attr_namelen = sizeof("system.") - 1, - .attr_flag = ATTR_SYSTEM, - .attr_get = attr_system_get, - .attr_set = attr_system_set, - .attr_remove = attr_system_remove, - .attr_capable = (attrcapable_t)fs_noerr, -}; - -struct attrnames attr_trusted = { - .attr_name = "trusted.", - .attr_namelen = sizeof("trusted.") - 1, - .attr_flag = ATTR_ROOT, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_trusted_capable, -}; - -struct attrnames attr_secure = { - .attr_name = "security.", - .attr_namelen = sizeof("security.") - 1, - .attr_flag = ATTR_SECURE, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_secure_capable, -}; - -struct attrnames attr_user = { - .attr_name = "user.", - .attr_namelen = sizeof("user.") - 1, - .attr_get = attr_generic_get, - .attr_set = attr_generic_set, - .attr_remove = attr_generic_remove, - .attr_capable = attr_user_capable, -}; - -struct attrnames *attr_namespaces[] = - { &attr_system, &attr_trusted, &attr_secure, &attr_user }; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 981633f6c07..dd482458947 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -18,9 +18,11 @@ #ifndef __XFS_ATTR_H__ #define __XFS_ATTR_H__ +struct xfs_inode; +struct xfs_da_args; +struct xfs_attr_list_context; + /* - * xfs_attr.h - * * Large attribute lists are structured around Btrees where all the data * elements are in the leaf nodes. Attribute names are hashed into an int, * then that int is used as the index into the Btree. Since the hashval @@ -35,35 +37,6 @@ * External interfaces *========================================================================*/ -struct cred; -struct bhv_vnode; - -typedef int (*attrset_t)(struct bhv_vnode *, char *, void *, size_t, int); -typedef int (*attrget_t)(struct bhv_vnode *, char *, void *, size_t, int); -typedef int (*attrremove_t)(struct bhv_vnode *, char *, int); -typedef int (*attrexists_t)(struct bhv_vnode *); -typedef int (*attrcapable_t)(struct bhv_vnode *, struct cred *); - -typedef struct attrnames { - char * attr_name; - unsigned int attr_namelen; - unsigned int attr_flag; - attrget_t attr_get; - attrset_t attr_set; - attrremove_t attr_remove; - attrexists_t attr_exists; - attrcapable_t attr_capable; -} attrnames_t; - -#define ATTR_NAMECOUNT 4 -extern struct attrnames attr_user; -extern struct attrnames attr_secure; -extern struct attrnames attr_system; -extern struct attrnames attr_trusted; -extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT]; - -extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int); -extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *); #define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */ #define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */ @@ -71,16 +44,19 @@ extern int attr_generic_list(struct bhv_vnode *, void *, size_t, int, ssize_t *) #define ATTR_SECURE 0x0008 /* use attrs in security namespace */ #define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */ #define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */ -#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */ -#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */ #define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */ #define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */ -#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */ -#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */ -#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */ -#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS) +#define XFS_ATTR_FLAGS \ + { ATTR_DONTFOLLOW, "DONTFOLLOW" }, \ + { ATTR_ROOT, "ROOT" }, \ + { ATTR_TRUST, "TRUST" }, \ + { ATTR_SECURE, "SECURE" }, \ + { ATTR_CREATE, "CREATE" }, \ + { ATTR_REPLACE, "REPLACE" }, \ + { ATTR_KERNOTIME, "KERNOTIME" }, \ + { ATTR_KERNOVAL, "KERNOVAL" } /* * The maximum size (into the kernel or returned from the kernel) of an @@ -119,22 +95,6 @@ typedef struct attrlist_ent { /* data from attr_list() */ &((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ]) /* - * Multi-attribute operation vector. - */ -typedef struct attr_multiop { - int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */ - int am_error; /* [out arg] result of this sub-op (an errno) */ - char *am_attrname; /* attribute name to work with */ - char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */ - int am_length; /* [in/out arg] length of value */ - int am_flags; /* bitwise OR of attr API flags defined above */ -} attr_multiop_t; - -#define ATTR_OP_GET 1 /* return the indicated attr's value */ -#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ -#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ - -/* * Kernel-internal version of the attrlist cursor. */ typedef struct attrlist_cursor_kern { @@ -148,25 +108,47 @@ typedef struct attrlist_cursor_kern { /*======================================================================== - * Function prototypes for the kernel. + * Structure used to pass context around among the routines. *========================================================================*/ -struct xfs_inode; -struct attrlist_cursor_kern; -struct xfs_da_args; + +typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, + unsigned char *, int, int, unsigned char *); + +typedef struct xfs_attr_list_context { + struct xfs_inode *dp; /* inode */ + struct attrlist_cursor_kern *cursor; /* position in list */ + char *alist; /* output buffer */ + int seen_enough; /* T/F: seen enough of list? */ + ssize_t count; /* num used entries */ + int dupcnt; /* count dup hashvals seen */ + int bufsize; /* total buffer size */ + int firstu; /* first used byte in buffer */ + int flags; /* from VOP call */ + int resynch; /* T/F: resynch with cursor */ + int put_value; /* T/F: need value for listent */ + put_listent_func_t put_listent; /* list output fmt function */ + int index; /* index into output buffer */ +} xfs_attr_list_context_t; + + +/*======================================================================== + * Function prototypes for the kernel. + *========================================================================*/ /* * Overall external interface routines. */ -int xfs_attr_get(bhv_desc_t *, const char *, char *, int *, int, struct cred *); -int xfs_attr_set(bhv_desc_t *, const char *, char *, int, int, struct cred *); -int xfs_attr_remove(bhv_desc_t *, const char *, int, struct cred *); -int xfs_attr_list(bhv_desc_t *, char *, int, int, - struct attrlist_cursor_kern *, struct cred *); int xfs_attr_inactive(struct xfs_inode *dp); +int xfs_attr_list_int(struct xfs_attr_list_context *); +int xfs_inode_hasattr(struct xfs_inode *ip); +int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name, + unsigned char *value, int *valuelenp, int flags); +int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, + unsigned char *value, int valuelen, int flags); +int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); +int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, + int flags, struct attrlist_cursor_kern *cursor); -int xfs_attr_shortform_getvalue(struct xfs_da_args *); -int xfs_attr_fetch(struct xfs_inode *, const char *, int, - char *, int *, int, struct cred *); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c new file mode 100644 index 00000000000..09480c57f06 --- /dev/null +++ b/fs/xfs/xfs_attr_inactive.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_attr_remote.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" + +/* + * Look at all the extents for this logical region, + * invalidate any buffers that are incore/in transactions. + */ +STATIC int +xfs_attr3_leaf_freextent( + struct xfs_trans **trans, + struct xfs_inode *dp, + xfs_dablk_t blkno, + int blkcnt) +{ + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_dablk_t tblkno; + xfs_daddr_t dblkno; + int tblkcnt; + int dblkcnt; + int nmap; + int error; + + /* + * Roll through the "value", invalidating the attribute value's + * blocks. + */ + tblkno = blkno; + tblkcnt = blkcnt; + while (tblkcnt > 0) { + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt, + &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) { + return(error); + } + ASSERT(nmap == 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + + /* + * If it's a hole, these are already unmapped + * so there's nothing to invalidate. + */ + if (map.br_startblock != HOLESTARTBLOCK) { + + dblkno = XFS_FSB_TO_DADDR(dp->i_mount, + map.br_startblock); + dblkcnt = XFS_FSB_TO_BB(dp->i_mount, + map.br_blockcount); + bp = xfs_trans_get_buf(*trans, + dp->i_mount->m_ddev_targp, + dblkno, dblkcnt, 0); + if (!bp) + return ENOMEM; + xfs_trans_binval(*trans, bp); + /* + * Roll to next transaction. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return (error); + } + + tblkno += map.br_blockcount; + tblkcnt -= map.br_blockcount; + } + + return(0); +} + +/* + * Invalidate all of the "remote" value regions pointed to by a particular + * leaf block. + * Note that we must release the lock on the buffer so that we are not + * caught holding something that the logging code wants to flush to disk. + */ +STATIC int +xfs_attr3_leaf_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_attr_inactive_list *list; + struct xfs_attr_inactive_list *lp; + int error; + int count; + int size; + int tmp; + int i; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + /* + * Count the number of "remote" value extents. + */ + count = 0; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) + count++; + } + } + + /* + * If there are no "remote" values, we're done. + */ + if (count == 0) { + xfs_trans_brelse(*trans, bp); + return 0; + } + + /* + * Allocate storage for a list of all the "remote" value extents. + */ + size = count * sizeof(xfs_attr_inactive_list_t); + list = kmem_alloc(size, KM_SLEEP); + + /* + * Identify each of the "remote" value extents. + */ + lp = list; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = 0; i < ichdr.count; entry++, i++) { + if (be16_to_cpu(entry->nameidx) && + ((entry->flags & XFS_ATTR_LOCAL) == 0)) { + name_rmt = xfs_attr3_leaf_name_remote(leaf, i); + if (name_rmt->valueblk) { + lp->valueblk = be32_to_cpu(name_rmt->valueblk); + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, + be32_to_cpu(name_rmt->valuelen)); + lp++; + } + } + } + xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */ + + /* + * Invalidate each of the "remote" value extents. + */ + error = 0; + for (lp = list, i = 0; i < count; i++, lp++) { + tmp = xfs_attr3_leaf_freextent(trans, dp, + lp->valueblk, lp->valuelen); + + if (error == 0) + error = tmp; /* save only the 1st errno */ + } + + kmem_free(list); + return error; +} + +/* + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +STATIC int +xfs_attr3_node_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp, + struct xfs_buf *bp, + int level) +{ + xfs_da_blkinfo_t *info; + xfs_da_intnode_t *node; + xfs_dablk_t child_fsb; + xfs_daddr_t parent_blkno, child_blkno; + int error, i; + struct xfs_buf *child_bp; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr ichdr; + + /* + * Since this code is recursive (gasp!) we must protect ourselves. + */ + if (level > XFS_DA_NODE_MAXDEPTH) { + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + return XFS_ERROR(EIO); + } + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&ichdr, node); + parent_blkno = bp->b_bn; + if (!ichdr.count) { + xfs_trans_brelse(*trans, bp); + return 0; + } + btree = dp->d_ops->node_tree_p(node); + child_fsb = be32_to_cpu(btree[0].before); + xfs_trans_brelse(*trans, bp); /* no locks for later trans */ + + /* + * If this is the node level just above the leaves, simply loop + * over the leaves removing all of them. If this is higher up + * in the tree, recurse downward. + */ + for (i = 0; i < ichdr.count; i++) { + /* + * Read the subsidiary block to see what we have to work with. + * Don't do this in a transaction. This is a depth-first + * traversal of the tree so we may deal with many blocks + * before we come back to this one. + */ + error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp, + XFS_ATTR_FORK); + if (error) + return(error); + if (child_bp) { + /* save for re-read later */ + child_blkno = XFS_BUF_ADDR(child_bp); + + /* + * Invalidate the subtree, however we have to. + */ + info = child_bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, + child_bp, level + 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, + child_bp); + break; + default: + error = XFS_ERROR(EIO); + xfs_trans_brelse(*trans, child_bp); + break; + } + if (error) + return error; + + /* + * Remove the subsidiary block from the cache + * and from the log. + */ + error = xfs_da_get_buf(*trans, dp, 0, child_blkno, + &child_bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, child_bp); + } + + /* + * If we're not done, re-read the parent to get the next + * child block number. + */ + if (i + 1 < ichdr.count) { + error = xfs_da3_node_read(*trans, dp, 0, parent_blkno, + &bp, XFS_ATTR_FORK); + if (error) + return error; + child_fsb = be32_to_cpu(btree[i + 1].before); + xfs_trans_brelse(*trans, bp); + } + /* + * Atomically commit the whole invalidate stuff. + */ + error = xfs_trans_roll(trans, dp); + if (error) + return error; + } + + return 0; +} + +/* + * Indiscriminately delete the entire attribute fork + * + * Recurse (gasp!) through the attribute nodes until we find leaves. + * We're doing a depth-first traversal in order to invalidate everything. + */ +int +xfs_attr3_root_inactive( + struct xfs_trans **trans, + struct xfs_inode *dp) +{ + struct xfs_da_blkinfo *info; + struct xfs_buf *bp; + xfs_daddr_t blkno; + int error; + + /* + * Read block 0 to see what we have to work with. + * We only get here if we have extents, since we remove + * the extents in reverse order the extent containing + * block 0 must still be there. + */ + error = xfs_da3_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); + if (error) + return error; + blkno = bp->b_bn; + + /* + * Invalidate the tree, even if the "tree" is only a single leaf block. + * This is a depth-first traversal! + */ + info = bp->b_addr; + switch (info->magic) { + case cpu_to_be16(XFS_DA_NODE_MAGIC): + case cpu_to_be16(XFS_DA3_NODE_MAGIC): + error = xfs_attr3_node_inactive(trans, dp, bp, 1); + break; + case cpu_to_be16(XFS_ATTR_LEAF_MAGIC): + case cpu_to_be16(XFS_ATTR3_LEAF_MAGIC): + error = xfs_attr3_leaf_inactive(trans, dp, bp); + break; + default: + error = XFS_ERROR(EIO); + xfs_trans_brelse(*trans, bp); + break; + } + if (error) + return error; + + /* + * Invalidate the incore copy of the root block. + */ + error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); + if (error) + return error; + xfs_trans_binval(*trans, bp); /* remove from cache */ + /* + * Commit the invalidate and start the next transaction. + */ + error = xfs_trans_roll(trans, dp); + + return error; +} + +int +xfs_attr_inactive(xfs_inode_t *dp) +{ + xfs_trans_t *trans; + xfs_mount_t *mp; + int error; + + mp = dp->i_mount; + ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, XFS_ILOCK_SHARED); + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + xfs_iunlock(dp, XFS_ILOCK_SHARED); + return 0; + } + xfs_iunlock(dp, XFS_ILOCK_SHARED); + + /* + * Start our first transaction of the day. + * + * All future transactions during this code must be "chained" off + * this one via the trans_dup() call. All transactions will contain + * the inode, and the inode will always be marked with trans_ihold(). + * Since the inode will be locked in all transactions, we must log + * the inode in every transaction to let it float upward through + * the log. + */ + trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); + error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); + if (error) { + xfs_trans_cancel(trans, 0); + return(error); + } + xfs_ilock(dp, XFS_ILOCK_EXCL); + + /* + * No need to make quota reservations here. We expect to release some + * blocks, not allocate, in the common case. + */ + xfs_trans_ijoin(trans, dp, 0); + + /* + * Decide on what work routines to call based on the inode size. + */ + if (!xfs_inode_hasattr(dp) || + dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = 0; + goto out; + } + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out; + + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out; + + error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + + return(error); + +out: + xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return(error); +} diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 9455051f012..28712d29e43 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,31 +18,32 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc.h" -#include "xfs_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" + /* * xfs_attr_leaf.c @@ -56,42 +58,233 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, - xfs_dabuf_t **bpp); -STATIC int xfs_attr_leaf_add_work(xfs_dabuf_t *leaf_buffer, xfs_da_args_t *args, - int freemap_index); -STATIC void xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *leaf_buffer); -STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, +STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); +STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_buf *leaf_buffer); +STATIC void xfs_attr3_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -STATIC int xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *leaf_blk_1, - xfs_da_state_blk_t *leaf_blk_2, - int *number_entries_in_blk1, - int *number_usedbytes_in_blk1); +STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state, + xfs_da_state_blk_t *leaf_blk_1, + struct xfs_attr3_icleaf_hdr *ichdr1, + xfs_da_state_blk_t *leaf_blk_2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *number_entries_in_blk1, + int *number_usedbytes_in_blk1); /* - * Routines used for shrinking the Btree. + * Utility routines. */ -STATIC int xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp, int level); -STATIC int xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dabuf_t *bp); -STATIC int xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt); +STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, + struct xfs_attr_leafblock *src_leaf, + struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start, + struct xfs_attr_leafblock *dst_leaf, + struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start, + int move_count); +STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); + +void +xfs_attr3_leaf_hdr_from_disk( + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + int i; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->usedbytes = be16_to_cpu(hdr3->usedbytes); + to->firstused = be16_to_cpu(hdr3->firstused); + to->holes = hdr3->holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(hdr3->freemap[i].base); + to->freemap[i].size = be16_to_cpu(hdr3->freemap[i].size); + } + return; + } + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->usedbytes = be16_to_cpu(from->hdr.usedbytes); + to->firstused = be16_to_cpu(from->hdr.firstused); + to->holes = from->hdr.holes; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->freemap[i].base = be16_to_cpu(from->hdr.freemap[i].base); + to->freemap[i].size = be16_to_cpu(from->hdr.freemap[i].size); + } +} + +void +xfs_attr3_leaf_hdr_to_disk( + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + int i; + + ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || + from->magic == XFS_ATTR3_LEAF_MAGIC); + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + struct xfs_attr3_leaf_hdr *hdr3 = (struct xfs_attr3_leaf_hdr *)to; + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->usedbytes = cpu_to_be16(from->usedbytes); + hdr3->firstused = cpu_to_be16(from->firstused); + hdr3->holes = from->holes; + hdr3->pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + hdr3->freemap[i].base = cpu_to_be16(from->freemap[i].base); + hdr3->freemap[i].size = cpu_to_be16(from->freemap[i].size); + } + return; + } + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.usedbytes = cpu_to_be16(from->usedbytes); + to->hdr.firstused = cpu_to_be16(from->firstused); + to->hdr.holes = from->holes; + to->hdr.pad1 = 0; + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + to->hdr.freemap[i].base = cpu_to_be16(from->freemap[i].base); + to->hdr.freemap[i].size = cpu_to_be16(from->freemap[i].size); + } +} + +static bool +xfs_attr3_leaf_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_attr3_icleaf_hdr ichdr; + + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_ATTR3_LEAF_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) + return false; + } + if (ichdr.count == 0) + return false; + + /* XXX: need to range check rest of attr header values */ + /* XXX: hash order check? */ + + return true; +} + +static void +xfs_attr3_leaf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_attr3_leaf_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF); +} /* - * Utility routines. + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. */ -STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, - int src_start, - xfs_attr_leafblock_t *dst_leaf, - int dst_start, int move_count, - xfs_mount_t *mp); -STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); -STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, - attrnames_t *, char *name, int namelen, - int valuelen); +static void +xfs_attr3_leaf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_attr3_leaf_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .verify_read = xfs_attr3_leaf_read_verify, + .verify_write = xfs_attr3_leaf_write_verify, +}; + +int +xfs_attr3_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + XFS_ATTR_FORK, &xfs_attr3_leaf_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_ATTR_LEAF_BUF); + return err; +} + +/*======================================================================== + * Namespace helper routines + *========================================================================*/ + +/* + * If namespace bits don't match return 0. + * If all match then return 1. + */ +STATIC int +xfs_attr_namesp_match(int arg_flags, int ondisk_flags) +{ + return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags); +} /*======================================================================== @@ -101,6 +294,7 @@ STATIC int xfs_attr_put_listent(xfs_attr_list_context_t *context, /* * Query whether the requested number of additional bytes of extended * attribute space will be able to fit inline. + * * Returns zero if not, else the di_forkoff fork offset to be used in the * literal area for attribute data once the new bytes have been added. * @@ -113,9 +307,11 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) int offset; int minforkoff; /* lower limit on valid forkoff locations */ int maxforkoff; /* upper limit on valid forkoff locations */ + int dsize; xfs_mount_t *mp = dp->i_mount; - offset = (XFS_LITINO(mp) - bytes) >> 3; /* rounded down */ + /* rounded down */ + offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3; switch (dp->i_d.di_format) { case XFS_DINODE_FMT_DEV: @@ -126,24 +322,74 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) return (offset >= minforkoff) ? minforkoff : 0; } - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { - if (bytes <= XFS_IFORK_ASIZE(dp)) - return mp->m_attroffset >> 3; + /* + * If the requested numbers of bytes is smaller or equal to the + * current attribute fork size we can always proceed. + * + * Note that if_bytes in the data fork might actually be larger than + * the current data fork size is due to delalloc extents. In that + * case either the extent count will go down when they are converted + * to real extents, or the delalloc conversion will take care of the + * literal area rebalancing. + */ + if (bytes <= XFS_IFORK_ASIZE(dp)) + return dp->i_d.di_forkoff; + + /* + * For attr2 we can try to move the forkoff if there is space in the + * literal area, but for the old format we are done if there is no + * space in the fixed attribute fork. + */ + if (!(mp->m_flags & XFS_MOUNT_ATTR2)) return 0; + + dsize = dp->i_df.if_bytes; + + switch (dp->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* + * If there is no attr fork and the data fork is extents, + * determine if creating the default attr fork will result + * in the extents form migrating to btree. If so, the + * minimum offset only needs to be the space required for + * the btree root. + */ + if (!dp->i_d.di_forkoff && dp->i_df.if_bytes > + xfs_default_attroffset(dp)) + dsize = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + break; + case XFS_DINODE_FMT_BTREE: + /* + * If we have a data btree then keep forkoff if we have one, + * otherwise we are adding a new attr, so then we set + * minforkoff to where the btree root can finish so we have + * plenty of room for attrs + */ + if (dp->i_d.di_forkoff) { + if (offset < dp->i_d.di_forkoff) + return 0; + return dp->i_d.di_forkoff; + } + dsize = XFS_BMAP_BROOT_SPACE(mp, dp->i_df.if_broot); + break; } - /* data fork btree root can have at least this many key/ptr pairs */ - minforkoff = MAX(dp->i_df.if_bytes, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); + /* + * A data fork btree root must have space for at least + * MINDBTPTRS key/ptr pairs if the data fork is small or empty. + */ + minforkoff = MAX(dsize, XFS_BMDR_SPACE_CALC(MINDBTPTRS)); minforkoff = roundup(minforkoff, 8) >> 3; /* attr fork btree root can have at least this many key/ptr pairs */ - maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS); + maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); maxforkoff = maxforkoff >> 3; /* rounded down */ - if (offset >= minforkoff && offset < maxforkoff) - return offset; if (offset >= maxforkoff) return maxforkoff; + if (offset >= minforkoff) + return offset; return 0; } @@ -153,17 +399,15 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes) STATIC void xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) { - unsigned long s; - if ((mp->m_flags & XFS_MOUNT_ATTR2) && - !(XFS_SB_VERSION_HASATTR2(&mp->m_sb))) { - s = XFS_SB_LOCK(mp); - if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { - XFS_SB_VERSION_ADDATTR2(&mp->m_sb); - XFS_SB_UNLOCK(mp, s); + !(xfs_sb_version_hasattr2(&mp->m_sb))) { + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr2(&mp->m_sb)) { + xfs_sb_version_addattr2(&mp->m_sb); + spin_unlock(&mp->m_sb_lock); xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); } else - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); } } @@ -177,6 +421,8 @@ xfs_attr_shortform_create(xfs_da_args_t *args) xfs_inode_t *dp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_create(args); + dp = args->dp; ASSERT(dp != NULL); ifp = dp->i_afp; @@ -210,13 +456,11 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) xfs_inode_t *dp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_add(args); + dp = args->dp; mp = dp->i_mount; dp->i_d.di_forkoff = forkoff; - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); ifp = dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); @@ -228,11 +472,7 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; ASSERT(0); #endif @@ -246,18 +486,36 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) sfe->namelen = args->namelen; sfe->valuelen = args->valuelen; - sfe->flags = (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : - ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); + sfe->flags = XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); memcpy(sfe->nameval, args->name, args->namelen); memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen); sf->hdr.count++; - be16_add(&sf->hdr.totsize, size); + be16_add_cpu(&sf->hdr.totsize, size); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); xfs_sbversion_add_attr2(mp, args->trans); } /* + * After the last attribute is removed revert to original inode format, + * making all literal area available to the data fork once more. + */ +STATIC void +xfs_attr_fork_reset( + struct xfs_inode *ip, + struct xfs_trans *tp) +{ + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + ip->i_d.di_forkoff = 0; + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + + ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_afp == NULL); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +} + +/* * Remove an attribute from the shortform attribute list structure. */ int @@ -269,6 +527,8 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) xfs_mount_t *mp; xfs_inode_t *dp; + trace_xfs_attr_sf_remove(args); + dp = args->dp; mp = dp->i_mount; base = sizeof(xfs_attr_sf_hdr_t); @@ -282,11 +542,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) continue; if (memcmp(sfe->nameval, args->name, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; break; } @@ -301,37 +557,25 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) if (end != totsize) memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end); sf->hdr.count--; - be16_add(&sf->hdr.totsize, -size); + be16_add_cpu(&sf->hdr.totsize, -size); /* * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname && - (mp->m_flags & XFS_MOUNT_ATTR2)) { - /* - * Last attribute now removed, revert to original - * inode format making all literal area available - * to the data fork once more. - */ - xfs_idestroy_fork(dp, XFS_ATTR_FORK); - dp->i_d.di_forkoff = 0; - dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ASSERT(dp->i_d.di_anextents == 0); - ASSERT(dp->i_afp == NULL); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + if (totsize == sizeof(xfs_attr_sf_hdr_t) && + (mp->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && + !(args->op_flags & XFS_DA_OP_ADDNAME)) { + xfs_attr_fork_reset(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); ASSERT(dp->i_d.di_forkoff); - ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname || - !(mp->m_flags & XFS_MOUNT_ATTR2)); - dp->i_afp->if_ext_max = - XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || + (args->op_flags & XFS_DA_OP_ADDNAME) || + !(mp->m_flags & XFS_MOUNT_ATTR2) || + dp->i_d.di_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); } @@ -353,6 +597,8 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) int i; xfs_ifork_t *ifp; + trace_xfs_attr_sf_lookup(args); + ifp = args->dp->i_afp; ASSERT(ifp->if_flags & XFS_IFINLINE); sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; @@ -363,11 +609,7 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; return(XFS_ERROR(EEXIST)); } @@ -385,7 +627,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) xfs_attr_sf_entry_t *sfe; int i; - ASSERT(args->dp->i_d.di_aformat == XFS_IFINLINE); + ASSERT(args->dp->i_afp->if_flags == XFS_IFINLINE); sf = (xfs_attr_shortform_t *)args->dp->i_afp->if_u1.if_data; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; @@ -394,11 +636,7 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) continue; if (memcmp(args->name, sfe->nameval, args->namelen) != 0) continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0)) - continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, sfe->flags)) continue; if (args->flags & ATTR_KERNOVAL) { args->valuelen = sfe->valuelen; @@ -429,9 +667,11 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) char *tmpbuffer; int error, i, size; xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_buf *bp; xfs_ifork_t *ifp; + trace_xfs_attr_sf_to_leaf(args); + dp = args->dp; ifp = dp->i_afp; sf = (xfs_attr_shortform_t *)ifp->if_u1.if_data; @@ -442,6 +682,8 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sf = (xfs_attr_shortform_t *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); + xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + bp = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -457,7 +699,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) } ASSERT(blkno == 0); - error = xfs_attr_leaf_create(args, blkno, &bp); + error = xfs_attr3_leaf_create(args, blkno, &bp); if (error) { error = xfs_da_shrink_inode(args, 0, bp); bp = NULL; @@ -470,26 +712,26 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; + nargs.geo = args->geo; nargs.firstblock = args->firstblock; nargs.flist = args->flist; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; - nargs.oknoent = 1; + nargs.op_flags = XFS_DA_OP_OKNOENT; sfe = &sf->list[0]; for (i = 0; i < sf->hdr.count; i++) { - nargs.name = (char *)sfe->nameval; + nargs.name = sfe->nameval; nargs.namelen = sfe->namelen; - nargs.value = (char *)&sfe->nameval[nargs.namelen]; + nargs.value = &sfe->nameval[nargs.namelen]; nargs.valuelen = sfe->valuelen; - nargs.hashval = xfs_da_hashname((char *)sfe->nameval, + nargs.hashval = xfs_da_hashname(sfe->nameval, sfe->namelen); - nargs.flags = (sfe->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : - ((sfe->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); - error = xfs_attr_leaf_lookup_int(bp, &nargs); /* set a->index */ + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ ASSERT(error == ENOATTR); - error = xfs_attr_leaf_add(bp, &nargs); + error = xfs_attr3_leaf_add(bp, &nargs); ASSERT(error != ENOSPC); if (error) goto out; @@ -498,265 +740,85 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) error = 0; out: - if(bp) - xfs_da_buf_done(bp); - kmem_free(tmpbuffer, size); + kmem_free(tmpbuffer); return(error); } -STATIC int -xfs_attr_shortform_compare(const void *a, const void *b) -{ - xfs_attr_sf_sort_t *sa, *sb; - - sa = (xfs_attr_sf_sort_t *)a; - sb = (xfs_attr_sf_sort_t *)b; - if (sa->hash < sb->hash) { - return(-1); - } else if (sa->hash > sb->hash) { - return(1); - } else { - return(sa->entno - sb->entno); - } -} - -/* - * Copy out entries of shortform attribute lists for attr_list(). - * Shortform attribute lists are not stored in hashval sorted order. - * If the output buffer is not large enough to hold them all, then we - * we have to calculate each entries' hashvalue and sort them before - * we can begin returning them to the user. - */ -/*ARGSUSED*/ -int -xfs_attr_shortform_list(xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_sf_sort_t *sbuf, *sbp; - xfs_attr_shortform_t *sf; - xfs_attr_sf_entry_t *sfe; - xfs_inode_t *dp; - int sbsize, nsbuf, count, i; - - ASSERT(context != NULL); - dp = context->dp; - ASSERT(dp != NULL); - ASSERT(dp->i_afp != NULL); - sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; - ASSERT(sf != NULL); - if (!sf->hdr.count) - return(0); - cursor = context->cursor; - ASSERT(cursor != NULL); - - xfs_attr_trace_l_c("sf start", context); - - /* - * If the buffer is large enough, do not bother with sorting. - * Note the generous fudge factor of 16 overhead bytes per entry. - */ - if ((dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize) { - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - attrnames_t *namesp; - - if (((context->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - if (((context->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - namesp = (sfe->flags & XFS_ATTR_SECURE) ? &attr_secure: - ((sfe->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - sfe->namelen + 1; - } - else { - if (xfs_attr_put_listent(context, namesp, - (char *)sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen)) - break; - } - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - } - xfs_attr_trace_l_c("sf big-gulp", context); - return(0); - } - - /* - * It didn't all fit, so we have to sort everything on hashval. - */ - sbsize = sf->hdr.count * sizeof(*sbuf); - sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); - - /* - * Scan the attribute list for the rest of the entries, storing - * the relevant info from only those that match into a buffer. - */ - nsbuf = 0; - for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - if (unlikely( - ((char *)sfe < (char *)sf) || - ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { - XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", - XFS_ERRLEVEL_LOW, - context->dp->i_mount, sfe); - xfs_attr_trace_l_c("sf corrupted", context); - kmem_free(sbuf, sbsize); - return XFS_ERROR(EFSCORRUPTED); - } - if (((context->flags & ATTR_SECURE) != 0) != - ((sfe->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - if (((context->flags & ATTR_ROOT) != 0) != - ((sfe->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) { - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - continue; - } - sbp->entno = i; - sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen); - sbp->name = (char *)sfe->nameval; - sbp->namelen = sfe->namelen; - /* These are bytes, and both on-disk, don't endian-flip */ - sbp->valuelen = sfe->valuelen; - sbp->flags = sfe->flags; - sfe = XFS_ATTR_SF_NEXTENTRY(sfe); - sbp++; - nsbuf++; - } - - /* - * Sort the entries on hash then entno. - */ - xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); - - /* - * Re-find our place IN THE SORTED LIST. - */ - count = 0; - cursor->initted = 1; - cursor->blkno = 0; - for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { - if (sbp->hash == cursor->hashval) { - if (cursor->offset == count) { - break; - } - count++; - } else if (sbp->hash > cursor->hashval) { - break; - } - } - if (i == nsbuf) { - kmem_free(sbuf, sbsize); - xfs_attr_trace_l_c("blk end", context); - return(0); - } - - /* - * Loop putting entries into the user buffer. - */ - for ( ; i < nsbuf; i++, sbp++) { - attrnames_t *namesp; - - namesp = (sbp->flags & XFS_ATTR_SECURE) ? &attr_secure : - ((sbp->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); - - if (cursor->hashval != sbp->hash) { - cursor->hashval = sbp->hash; - cursor->offset = 0; - } - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - sbp->namelen + 1; - } else { - if (xfs_attr_put_listent(context, namesp, - sbp->name, sbp->namelen, - sbp->valuelen)) - break; - } - cursor->offset++; - } - - kmem_free(sbuf, sbsize); - xfs_attr_trace_l_c("sf E-O-F", context); - return(0); -} - /* * Check a leaf attribute block to see if all the entries would fit into * a shortform attribute list. */ int -xfs_attr_shortform_allfit(xfs_dabuf_t *bp, xfs_inode_t *dp) +xfs_attr_shortform_allfit( + struct xfs_buf *bp, + struct xfs_inode *dp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; xfs_attr_leaf_name_local_t *name_loc; - int bytes, i; + struct xfs_attr3_icleaf_hdr leafhdr; + int bytes; + int i; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); - entry = &leaf->entries[0]; bytes = sizeof(struct xfs_attr_sf_hdr); - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + for (i = 0; i < leafhdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!(entry->flags & XFS_ATTR_LOCAL)) return(0); - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); + name_loc = xfs_attr3_leaf_name_local(leaf, i); if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) return(0); - bytes += sizeof(struct xfs_attr_sf_entry)-1 + bytes += sizeof(struct xfs_attr_sf_entry) - 1 + name_loc->namelen + be16_to_cpu(name_loc->valuelen); } if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) - return(-1); - return(xfs_attr_shortform_bytesfit(dp, bytes)); + return -1; + return xfs_attr_shortform_bytesfit(dp, bytes); } /* * Convert a leaf attribute list to shortform attribute list */ int -xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) +xfs_attr3_leaf_to_shortform( + struct xfs_buf *bp, + struct xfs_da_args *args, + int forkoff) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_da_args_t nargs; - xfs_inode_t *dp; - char *tmpbuffer; - int error, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_da_args nargs; + struct xfs_inode *dp = args->dp; + char *tmpbuffer; + int error; + int i; - dp = args->dp; - tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP); - ASSERT(tmpbuffer != NULL); + trace_xfs_attr_leaf_to_sf(args); + + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + if (!tmpbuffer) + return ENOMEM; + + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); - ASSERT(bp != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount)); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - memset(bp->data, 0, XFS_LBSIZE(dp->i_mount)); + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entry = xfs_attr3_leaf_entryp(leaf); + + /* XXX (dgc): buffer is about to be marked stale - why zero it? */ + memset(bp->b_addr, 0, args->geo->blksize); /* * Clean out the prior contents of the attribute list. @@ -767,20 +829,8 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); - - /* - * Last attribute was removed, revert to original - * inode format making all literal area available - * to the data fork once more. - */ - xfs_idestroy_fork(dp, XFS_ATTR_FORK); - dp->i_d.di_forkoff = 0; - dp->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - ASSERT(dp->i_d.di_anextents == 0); - ASSERT(dp->i_afp == NULL); - dp->i_df.if_ext_max = - XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t); - xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); + ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); + xfs_attr_fork_reset(dp, args->trans); goto out; } @@ -790,96 +840,105 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff) * Copy the attributes */ memset((char *)&nargs, 0, sizeof(nargs)); + nargs.geo = args->geo; nargs.dp = dp; nargs.firstblock = args->firstblock; nargs.flist = args->flist; nargs.total = args->total; nargs.whichfork = XFS_ATTR_FORK; nargs.trans = args->trans; - nargs.oknoent = 1; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { + nargs.op_flags = XFS_DA_OP_OKNOENT; + + for (i = 0; i < ichdr.count; entry++, i++) { if (entry->flags & XFS_ATTR_INCOMPLETE) continue; /* don't copy partial entries */ if (!entry->nameidx) continue; ASSERT(entry->flags & XFS_ATTR_LOCAL); - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); - nargs.name = (char *)name_loc->nameval; + name_loc = xfs_attr3_leaf_name_local(leaf, i); + nargs.name = name_loc->nameval; nargs.namelen = name_loc->namelen; - nargs.value = (char *)&name_loc->nameval[nargs.namelen]; + nargs.value = &name_loc->nameval[nargs.namelen]; nargs.valuelen = be16_to_cpu(name_loc->valuelen); nargs.hashval = be32_to_cpu(entry->hashval); - nargs.flags = (entry->flags & XFS_ATTR_SECURE) ? ATTR_SECURE : - ((entry->flags & XFS_ATTR_ROOT) ? ATTR_ROOT : 0); + nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(entry->flags); xfs_attr_shortform_add(&nargs, forkoff); } error = 0; out: - kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount)); - return(error); + kmem_free(tmpbuffer); + return error; } /* * Convert from using a single leaf to a root node and a leaf. */ int -xfs_attr_leaf_to_node(xfs_da_args_t *args) +xfs_attr3_leaf_to_node( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_da_intnode_t *node; - xfs_inode_t *dp; - xfs_dabuf_t *bp1, *bp2; - xfs_dablk_t blkno; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr icleafhdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr icnodehdr; + struct xfs_da_intnode *node; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp1 = NULL; + struct xfs_buf *bp2 = NULL; + xfs_dablk_t blkno; + int error; + + trace_xfs_attr_leaf_to_node(args); - dp = args->dp; - bp1 = bp2 = NULL; error = xfs_da_grow_inode(args, &blkno); if (error) goto out; - error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, - XFS_ATTR_FORK); + error = xfs_attr3_leaf_read(args->trans, dp, 0, -1, &bp1); if (error) goto out; - ASSERT(bp1 != NULL); - bp2 = NULL; - error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, - XFS_ATTR_FORK); + + error = xfs_da_get_buf(args->trans, dp, blkno, -1, &bp2, XFS_ATTR_FORK); if (error) goto out; - ASSERT(bp2 != NULL); - memcpy(bp2->data, bp1->data, XFS_LBSIZE(dp->i_mount)); - xfs_da_buf_done(bp1); - bp1 = NULL; - xfs_da_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); + + /* copy leaf to new buffer, update identifiers */ + xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); + bp2->b_ops = bp1->b_ops; + memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; + hdr3->blkno = cpu_to_be64(bp2->b_bn); + } + xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); /* * Set up the new root node. */ - error = xfs_da_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); + error = xfs_da3_node_create(args, 0, 1, &bp1, XFS_ATTR_FORK); if (error) goto out; - node = bp1->data; - leaf = bp2->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + node = bp1->b_addr; + dp->d_ops->node_hdr_from_disk(&icnodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + leaf = bp2->b_addr; + xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + /* both on-disk, don't endian-flip twice */ - node->btree[0].hashval = - leaf->entries[be16_to_cpu(leaf->hdr.count)-1 ].hashval; - node->btree[0].before = cpu_to_be32(blkno); - node->hdr.count = cpu_to_be16(1); - xfs_da_log_buf(args->trans, bp1, 0, XFS_LBSIZE(dp->i_mount) - 1); + btree[0].hashval = entries[icleafhdr.count - 1].hashval; + btree[0].before = cpu_to_be32(blkno); + icnodehdr.count = 1; + dp->d_ops->node_hdr_to_disk(node, &icnodehdr); + xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1); error = 0; out: - if (bp1) - xfs_da_buf_done(bp1); - if (bp2) - xfs_da_buf_done(bp2); - return(error); + return error; } - /*======================================================================== * Routines used for growing the Btree. *========================================================================*/ @@ -889,51 +948,69 @@ out: * or a leaf in a node attribute list. */ STATIC int -xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t blkno, xfs_dabuf_t **bpp) +xfs_attr3_leaf_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + struct xfs_buf **bpp) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_inode_t *dp; - xfs_dabuf_t *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + trace_xfs_attr_leaf_create(args); - dp = args->dp; - ASSERT(dp != NULL); error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp, XFS_ATTR_FORK); if (error) - return(error); - ASSERT(bp != NULL); - leaf = bp->data; - memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); - hdr = &leaf->hdr; - hdr->info.magic = cpu_to_be16(XFS_ATTR_LEAF_MAGIC); - hdr->firstused = cpu_to_be16(XFS_LBSIZE(dp->i_mount)); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - XFS_LBSIZE(dp->i_mount) - XFS_ATTR_LEAF_NAME_ALIGN); - } + return error; + bp->b_ops = &xfs_attr3_leaf_buf_ops; + xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF); + leaf = bp->b_addr; + memset(leaf, 0, args->geo->blksize); - hdr->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memset(&ichdr, 0, sizeof(ichdr)); + ichdr.firstused = args->geo->blksize; - xfs_da_log_buf(args->trans, bp, 0, XFS_LBSIZE(dp->i_mount) - 1); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_blkinfo *hdr3 = bp->b_addr; + + ichdr.magic = XFS_ATTR3_LEAF_MAGIC; + + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + ichdr.freemap[0].base = sizeof(struct xfs_attr3_leaf_hdr); + } else { + ichdr.magic = XFS_ATTR_LEAF_MAGIC; + ichdr.freemap[0].base = sizeof(struct xfs_attr_leaf_hdr); + } + ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; + + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; - return(0); + return 0; } /* * Split the leaf node, rebalance, then add the new entry. */ int -xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_attr3_leaf_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { xfs_dablk_t blkno; int error; + trace_xfs_attr_leaf_split(state->args); + /* * Allocate space for a new leaf node. */ @@ -941,7 +1018,7 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, error = xfs_da_grow_inode(state->args, &blkno); if (error) return(error); - error = xfs_attr_leaf_create(state->args, blkno, &newblk->bp); + error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); if (error) return(error); newblk->blkno = blkno; @@ -951,8 +1028,8 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Rebalance the entries across the two leaves. * NOTE: rebalance() currently depends on the 2nd block being empty. */ - xfs_attr_leaf_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_attr3_leaf_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); @@ -963,10 +1040,13 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * Insert the "new" entry in the correct block. */ - if (state->inleaf) - error = xfs_attr_leaf_add(oldblk->bp, state->args); - else - error = xfs_attr_leaf_add(newblk->bp, state->args); + if (state->inleaf) { + trace_xfs_attr_leaf_add_old(state->args); + error = xfs_attr3_leaf_add(oldblk->bp, state->args); + } else { + trace_xfs_attr_leaf_add_new(state->args); + error = xfs_attr3_leaf_add(newblk->bp, state->args); + } /* * Update last hashval in each block since we added the name. @@ -980,44 +1060,46 @@ xfs_attr_leaf_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Add a name to the leaf attribute list structure. */ int -xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr3_leaf_add( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - int tablesize, entsize, sum, tmp, i; - - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT((args->index >= 0) - && (args->index <= be16_to_cpu(leaf->hdr.count))); - hdr = &leaf->hdr; - entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - args->trans->t_mountp->m_sb.sb_blocksize, NULL); + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; + int sum; + int tmp; + int i; + + trace_xfs_attr_leaf_add(args); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index >= 0 && args->index <= ichdr.count); + entsize = xfs_attr_leaf_newentsize(args, NULL); /* * Search through freemap for first-fit on new name length. * (may need to figure in size of entry struct too) */ - tablesize = (be16_to_cpu(hdr->count) + 1) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[XFS_ATTR_LEAF_MAPSIZE-1]; - for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE-1; i >= 0; map--, i--) { - if (tablesize > be16_to_cpu(hdr->firstused)) { - sum += be16_to_cpu(map->size); + tablesize = (ichdr.count + 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + for (sum = 0, i = XFS_ATTR_LEAF_MAPSIZE - 1; i >= 0; i--) { + if (tablesize > ichdr.firstused) { + sum += ichdr.freemap[i].size; continue; } - if (!map->size) + if (!ichdr.freemap[i].size) continue; /* no space in this map */ tmp = entsize; - if (be16_to_cpu(map->base) < be16_to_cpu(hdr->firstused)) + if (ichdr.freemap[i].base < ichdr.firstused) tmp += sizeof(xfs_attr_leaf_entry_t); - if (be16_to_cpu(map->size) >= tmp) { - tmp = xfs_attr_leaf_add_work(bp, args, i); - return(tmp); + if (ichdr.freemap[i].size >= tmp) { + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; } - sum += be16_to_cpu(map->size); + sum += ichdr.freemap[i].size; } /* @@ -1025,98 +1107,104 @@ xfs_attr_leaf_add(xfs_dabuf_t *bp, xfs_da_args_t *args) * and we don't have enough freespace, then compaction will do us * no good and we should just give up. */ - if (!hdr->holes && (sum < entsize)) - return(XFS_ERROR(ENOSPC)); + if (!ichdr.holes && sum < entsize) + return XFS_ERROR(ENOSPC); /* * Compact the entries to coalesce free space. * This may change the hdr->count via dropping INCOMPLETE entries. */ - xfs_attr_leaf_compact(args->trans, bp); + xfs_attr3_leaf_compact(args, &ichdr, bp); /* * After compaction, the block is guaranteed to have only one * free region, in freemap[0]. If it is not big enough, give up. */ - if (be16_to_cpu(hdr->freemap[0].size) - < (entsize + sizeof(xfs_attr_leaf_entry_t))) - return(XFS_ERROR(ENOSPC)); + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { + tmp = ENOSPC; + goto out_log_hdr; + } - return(xfs_attr_leaf_add_work(bp, args, 0)); + tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); + +out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); + return tmp; } /* * Add a name to a leaf attribute list structure. */ STATIC int -xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) +xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, + int mapindex) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_leaf_map_t *map; - xfs_mount_t *mp; - int tmp, i; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_mount *mp; + int tmp; + int i; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - hdr = &leaf->hdr; - ASSERT((mapindex >= 0) && (mapindex < XFS_ATTR_LEAF_MAPSIZE)); - ASSERT((args->index >= 0) && (args->index <= be16_to_cpu(hdr->count))); + trace_xfs_attr_leaf_add_work(args); + + leaf = bp->b_addr; + ASSERT(mapindex >= 0 && mapindex < XFS_ATTR_LEAF_MAPSIZE); + ASSERT(args->index >= 0 && args->index <= ichdr->count); /* * Force open some space in the entry array and fill it in. */ - entry = &leaf->entries[args->index]; - if (args->index < be16_to_cpu(hdr->count)) { - tmp = be16_to_cpu(hdr->count) - args->index; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + if (args->index < ichdr->count) { + tmp = ichdr->count - args->index; tmp *= sizeof(xfs_attr_leaf_entry_t); - memmove((char *)(entry+1), (char *)entry, tmp); - xfs_da_log_buf(args->trans, bp, + memmove(entry + 1, entry, tmp); + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); } - be16_add(&hdr->count, 1); + ichdr->count++; /* * Allocate space for the new string (at the end of the run). */ - map = &hdr->freemap[mapindex]; mp = args->trans->t_mountp; - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->base) & 0x3) == 0); - ASSERT(be16_to_cpu(map->size) >= - xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, NULL)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - ASSERT((be16_to_cpu(map->size) & 0x3) == 0); - be16_add(&map->size, - -xfs_attr_leaf_newentsize(args->namelen, args->valuelen, - mp->m_sb.sb_blocksize, &tmp)); - entry->nameidx = cpu_to_be16(be16_to_cpu(map->base) + - be16_to_cpu(map->size)); + ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0); + ASSERT(ichdr->freemap[mapindex].size >= + xfs_attr_leaf_newentsize(args, NULL)); + ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize); + ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0); + + ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp); + + entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base + + ichdr->freemap[mapindex].size); entry->hashval = cpu_to_be32(args->hashval); entry->flags = tmp ? XFS_ATTR_LOCAL : 0; - entry->flags |= (args->flags & ATTR_SECURE) ? XFS_ATTR_SECURE : - ((args->flags & ATTR_ROOT) ? XFS_ATTR_ROOT : 0); - if (args->rename) { + entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags); + if (args->op_flags & XFS_DA_OP_RENAME) { entry->flags |= XFS_ATTR_INCOMPLETE; if ((args->blkno2 == args->blkno) && (args->index2 <= args->index)) { args->index2++; } } - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); ASSERT((args->index == 0) || (be32_to_cpu(entry->hashval) >= be32_to_cpu((entry-1)->hashval))); - ASSERT((args->index == be16_to_cpu(hdr->count)-1) || + ASSERT((args->index == ichdr->count - 1) || (be32_to_cpu(entry->hashval) <= be32_to_cpu((entry+1)->hashval))); /* - * Copy the attribute name and value into the new space. - * * For "remote" attribute values, simply note that we need to * allocate space for the "remote" value. We can't actually * allocate the extents in this transaction, and we can't decide @@ -1124,14 +1212,14 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) * as part of this transaction (a split operation for example). */ if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); name_loc->namelen = args->namelen; name_loc->valuelen = cpu_to_be16(args->valuelen); memcpy((char *)name_loc->nameval, args->name, args->namelen); memcpy((char *)&name_loc->nameval[args->namelen], args->value, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->namelen = args->namelen; memcpy((char *)name_rmt->name, args->name, args->namelen); entry->flags |= XFS_ATTR_INCOMPLETE; @@ -1139,84 +1227,129 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex) name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } - xfs_da_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), xfs_attr_leaf_entsize(leaf, args->index))); /* * Update the control info for this leaf node */ - if (be16_to_cpu(entry->nameidx) < be16_to_cpu(hdr->firstused)) { - /* both on-disk, don't endian-flip twice */ - hdr->firstused = entry->nameidx; - } - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - tmp = (be16_to_cpu(hdr->count)-1) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - if (be16_to_cpu(map->base) == tmp) { - be16_add(&map->base, sizeof(xfs_attr_leaf_entry_t)); - be16_add(&map->size, - -((int)sizeof(xfs_attr_leaf_entry_t))); + if (be16_to_cpu(entry->nameidx) < ichdr->firstused) + ichdr->firstused = be16_to_cpu(entry->nameidx); + + ASSERT(ichdr->firstused >= ichdr->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf)); + tmp = (ichdr->count - 1) * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + if (ichdr->freemap[i].base == tmp) { + ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t); + ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t); } } - be16_add(&hdr->usedbytes, xfs_attr_leaf_entsize(leaf, args->index)); - xfs_da_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); - return(0); + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); + return 0; } /* * Garbage collect a leaf attribute list block by copying it to a new buffer. */ STATIC void -xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) +xfs_attr3_leaf_compact( + struct xfs_da_args *args, + struct xfs_attr3_icleaf_hdr *ichdr_dst, + struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_mount_t *mp; - char *tmpbuffer; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; + struct xfs_trans *trans = args->trans; + char *tmpbuffer; - mp = trans->t_mountp; - tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memcpy(tmpbuffer, bp->data, XFS_LBSIZE(mp)); - memset(bp->data, 0, XFS_LBSIZE(mp)); + trace_xfs_attr_leaf_compact(args); + + tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); + memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); + memset(bp->b_addr, 0, args->geo->blksize); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->data; - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - hdr_d->info = hdr_s->info; /* struct copy */ - hdr_d->firstused = cpu_to_be16(XFS_LBSIZE(mp)); - /* handle truncation gracefully */ - if (!hdr_d->firstused) { - hdr_d->firstused = cpu_to_be16( - XFS_LBSIZE(mp) - XFS_ATTR_LEAF_NAME_ALIGN); - } - hdr_d->usedbytes = 0; - hdr_d->count = 0; - hdr_d->holes = 0; - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - sizeof(xfs_attr_leaf_hdr_t)); + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = args->geo->blksize; + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr_leaf_moveents(leaf_s, 0, leaf_d, 0, - be16_to_cpu(hdr_s->count), mp); - xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1); + xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0, + leaf_dst, ichdr_dst, 0, ichdr_src.count); + /* + * this logs the entire buffer, but the caller must write the header + * back to the buffer when it is finished modifying it. + */ + xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); + + kmem_free(tmpbuffer); +} - kmem_free(tmpbuffer, XFS_LBSIZE(mp)); +/* + * Compare two leaf blocks "order". + * Return 0 unless leaf2 should go before leaf1. + */ +static int +xfs_attr3_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_attr3_icleaf_hdr *leaf1hdr, + struct xfs_buf *leaf2_bp, + struct xfs_attr3_icleaf_hdr *leaf2hdr) +{ + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + + entries1 = xfs_attr3_leaf_entryp(leaf1_bp->b_addr); + entries2 = xfs_attr3_leaf_entryp(leaf2_bp->b_addr); + if (leaf1hdr->count > 0 && leaf2hdr->count > 0 && + ((be32_to_cpu(entries2[0].hashval) < + be32_to_cpu(entries1[0].hashval)) || + (be32_to_cpu(entries2[leaf2hdr->count - 1].hashval) < + be32_to_cpu(entries1[leaf1hdr->count - 1].hashval)))) { + return 1; + } + return 0; +} + +int +xfs_attr_leaf_order( + struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp) +{ + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } /* @@ -1232,26 +1365,38 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp) * the "new" and "old" values can end up in different blocks. */ STATIC void -xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_attr3_leaf_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_args_t *args; - xfs_da_state_blk_t *tmp_blk; - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - int count, totallen, max, space, swap; + struct xfs_da_args *args; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_attr_leaf_entry *entries1; + struct xfs_attr_leaf_entry *entries2; + int count; + int totallen; + int max; + int space; + int swap; /* * Set up environment. */ ASSERT(blk1->magic == XFS_ATTR_LEAF_MAGIC); ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; - ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(ichdr2.count == 0); args = state->args; + trace_xfs_attr_leaf_rebalance(args); + /* * Check ordering of blocks, reverse if it makes things simpler. * @@ -1259,16 +1404,23 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * second block, this code should never set "swap". */ swap = 0; - if (xfs_attr_leaf_order(blk1->bp, blk2->bp)) { + if (xfs_attr3_leaf_order(blk1->bp, &ichdr1, blk2->bp, &ichdr2)) { + struct xfs_da_state_blk *tmp_blk; + struct xfs_attr3_icleaf_hdr tmp_ichdr; + tmp_blk = blk1; blk1 = blk2; blk2 = tmp_blk; - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; + + /* struct copies to swap them rather than reconverting */ + tmp_ichdr = ichdr1; + ichdr1 = ichdr2; + ichdr2 = tmp_ichdr; + + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; swap = 1; } - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; /* * Examine entries until we reduce the absolute difference in @@ -1278,82 +1430,80 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * "inleaf" is true if the new entry should be inserted into blk1. * If "swap" is also true, then reverse the sense of "inleaf". */ - state->inleaf = xfs_attr_leaf_figure_balance(state, blk1, blk2, - &count, &totallen); + state->inleaf = xfs_attr3_leaf_figure_balance(state, blk1, &ichdr1, + blk2, &ichdr2, + &count, &totallen); if (swap) state->inleaf = !state->inleaf; /* * Move any entries required from leaf to leaf: */ - if (count < be16_to_cpu(hdr1->count)) { + if (count < ichdr1.count) { /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count = be16_to_cpu(hdr1->count) - count; - space = be16_to_cpu(hdr1->usedbytes) - totallen; + count = ichdr1.count - count; + space = ichdr1.usedbytes - totallen; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf2 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr2->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk2->bp); - } + max = ichdr2.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr2.count * sizeof(xfs_attr_leaf_entry_t); + if (space > max) + xfs_attr3_leaf_compact(args, &ichdr2, blk2->bp); /* * Move high entries from leaf1 to low end of leaf2. */ - xfs_attr_leaf_moveents(leaf1, be16_to_cpu(hdr1->count) - count, - leaf2, 0, count, state->mp); + xfs_attr3_leaf_moveents(args, leaf1, &ichdr1, + ichdr1.count - count, leaf2, &ichdr2, 0, count); - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); - } else if (count > be16_to_cpu(hdr1->count)) { + } else if (count > ichdr1.count) { /* * I assert that since all callers pass in an empty * second buffer, this code should never execute. */ + ASSERT(0); /* * Figure the total bytes to be added to the destination leaf. */ /* number entries being moved */ - count -= be16_to_cpu(hdr1->count); - space = totallen - be16_to_cpu(hdr1->usedbytes); + count -= ichdr1.count; + space = totallen - ichdr1.usedbytes; space += count * sizeof(xfs_attr_leaf_entry_t); /* * leaf1 is the destination, compact it if it looks tight. */ - max = be16_to_cpu(hdr1->firstused) - - sizeof(xfs_attr_leaf_hdr_t); - max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); - if (space > max) { - xfs_attr_leaf_compact(args->trans, blk1->bp); - } + max = ichdr1.firstused - xfs_attr3_leaf_hdr_size(leaf1); + max -= ichdr1.count * sizeof(xfs_attr_leaf_entry_t); + if (space > max) + xfs_attr3_leaf_compact(args, &ichdr1, blk1->bp); /* * Move low entries from leaf2 to high end of leaf1. */ - xfs_attr_leaf_moveents(leaf2, 0, leaf1, - be16_to_cpu(hdr1->count), count, state->mp); - - xfs_da_log_buf(args->trans, blk1->bp, 0, state->blocksize-1); - xfs_da_log_buf(args->trans, blk2->bp, 0, state->blocksize-1); + xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1, + ichdr1.count, count); } + xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); + xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); + /* * Copy out last hashval in each block for B-tree code. */ - blk1->hashval = be32_to_cpu( - leaf1->entries[be16_to_cpu(leaf1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu( - leaf2->entries[be16_to_cpu(leaf2->hdr.count)-1].hashval); + entries1 = xfs_attr3_leaf_entryp(leaf1); + entries2 = xfs_attr3_leaf_entryp(leaf2); + blk1->hashval = be32_to_cpu(entries1[ichdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(entries2[ichdr2.count - 1].hashval); /* * Adjust the expected index for insertion. @@ -1367,22 +1517,35 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * inserting. The index/blkno fields refer to the "old" entry, * while the index2/blkno2 fields refer to the "new" entry. */ - if (blk1->index > be16_to_cpu(leaf1->hdr.count)) { + if (blk1->index > ichdr1.count) { ASSERT(state->inleaf == 0); - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); + blk2->index = blk1->index - ichdr1.count; args->index = args->index2 = blk2->index; args->blkno = args->blkno2 = blk2->blkno; - } else if (blk1->index == be16_to_cpu(leaf1->hdr.count)) { + } else if (blk1->index == ichdr1.count) { if (state->inleaf) { args->index = blk1->index; args->blkno = blk1->blkno; args->index2 = 0; args->blkno2 = blk2->blkno; } else { - blk2->index = blk1->index - - be16_to_cpu(leaf1->hdr.count); - args->index = args->index2 = blk2->index; - args->blkno = args->blkno2 = blk2->blkno; + /* + * On a double leaf split, the original attr location + * is already stored in blkno2/index2, so don't + * overwrite it overwise we corrupt the tree. + */ + blk2->index = blk1->index - ichdr1.count; + args->index = blk2->index; + args->blkno = blk2->blkno; + if (!state->extravalid) { + /* + * set the new attr location to match the old + * one and let the higher level split code + * decide where in the leaf to place it. + */ + args->index2 = blk2->index; + args->blkno2 = blk2->blkno; + } } } else { ASSERT(state->inleaf == 1); @@ -1399,42 +1562,38 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * GROT: Do a double-split for this case? */ STATIC int -xfs_attr_leaf_figure_balance(xfs_da_state_t *state, - xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2, - int *countarg, int *usedbytesarg) +xfs_attr3_leaf_figure_balance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_attr3_icleaf_hdr *ichdr1, + struct xfs_da_state_blk *blk2, + struct xfs_attr3_icleaf_hdr *ichdr2, + int *countarg, + int *usedbytesarg) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_hdr_t *hdr1, *hdr2; - xfs_attr_leaf_entry_t *entry; - int count, max, index, totallen, half; - int lastdelta, foundit, tmp; - - /* - * Set up environment. - */ - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; - hdr1 = &leaf1->hdr; - hdr2 = &leaf2->hdr; - foundit = 0; - totallen = 0; + struct xfs_attr_leafblock *leaf1 = blk1->bp->b_addr; + struct xfs_attr_leafblock *leaf2 = blk2->bp->b_addr; + struct xfs_attr_leaf_entry *entry; + int count; + int max; + int index; + int totallen = 0; + int half; + int lastdelta; + int foundit = 0; + int tmp; /* * Examine entries until we reduce the absolute difference in * byte usage between the two blocks to a minimum. */ - max = be16_to_cpu(hdr1->count) + be16_to_cpu(hdr2->count); - half = (max+1) * sizeof(*entry); - half += be16_to_cpu(hdr1->usedbytes) + - be16_to_cpu(hdr2->usedbytes) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + max = ichdr1->count + ichdr2->count; + half = (max + 1) * sizeof(*entry); + half += ichdr1->usedbytes + ichdr2->usedbytes + + xfs_attr_leaf_newentsize(state->args, NULL); half /= 2; - lastdelta = state->blocksize; - entry = &leaf1->entries[0]; + lastdelta = state->args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf1); for (count = index = 0; count < max; entry++, index++, count++) { #define XFS_ATTR_ABS(A) (((A) < 0) ? -(A) : (A)) @@ -1443,10 +1602,7 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, */ if (count == blk1->index) { tmp = totallen + sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); if (XFS_ATTR_ABS(half - tmp) > lastdelta) break; lastdelta = XFS_ATTR_ABS(half - tmp); @@ -1457,9 +1613,9 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, /* * Wrap around into the second block if necessary. */ - if (count == be16_to_cpu(hdr1->count)) { + if (count == ichdr1->count) { leaf1 = leaf2; - entry = &leaf1->entries[0]; + entry = xfs_attr3_leaf_entryp(leaf1); index = 0; } @@ -1482,15 +1638,12 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, totallen -= count * sizeof(*entry); if (foundit) { totallen -= sizeof(*entry) + - xfs_attr_leaf_newentsize( - state->args->namelen, - state->args->valuelen, - state->blocksize, NULL); + xfs_attr_leaf_newentsize(state->args, NULL); } *countarg = count; *usedbytesarg = totallen; - return(foundit); + return foundit; } /*======================================================================== @@ -1509,14 +1662,22 @@ xfs_attr_leaf_figure_balance(xfs_da_state_t *state, * GROT: allow for INCOMPLETE entries in calculation. */ int -xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) +xfs_attr3_leaf_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_attr_leafblock_t *leaf; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, bytes, forward, error, retval, i; - xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_da_state_blk *blk; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_buf *bp; + xfs_dablk_t blkno; + int bytes; + int forward; + int error; + int retval; + int i; + + trace_xfs_attr_leaf_toosmall(state->args); /* * Check for the degenerate case of the block being over 50% full. @@ -1524,14 +1685,12 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = sizeof(xfs_attr_leaf_hdr_t) + - count * sizeof(xfs_attr_leaf_entry_t) + - be16_to_cpu(leaf->hdr.usedbytes); - if (bytes > (state->blocksize >> 1)) { + leaf = blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + bytes = xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + ichdr.usedbytes; + if (bytes > (state->args->geo->blksize >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); } @@ -1542,14 +1701,14 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (ichdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (ichdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -1558,7 +1717,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) } else { *action = 2; } - return(0); + return 0; } /* @@ -1569,31 +1728,30 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * to shrink an attribute list over time. */ /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = ichdr.forw < ichdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_attr3_icleaf_hdr ichdr2; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = ichdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = ichdr.back; if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, - blkno, -1, &bp, XFS_ATTR_FORK); + error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, + blkno, -1, &bp); if (error) return(error); - ASSERT(bp != NULL); - - leaf = (xfs_attr_leafblock_t *)info; - count = be16_to_cpu(leaf->hdr.count); - bytes = state->blocksize - (state->blocksize>>2); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - count += be16_to_cpu(leaf->hdr.count); - bytes -= be16_to_cpu(leaf->hdr.usedbytes); - bytes -= count * sizeof(xfs_attr_leaf_entry_t); - bytes -= sizeof(xfs_attr_leaf_hdr_t); - xfs_da_brelse(state->args->trans, bp); + + xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2) - + ichdr.usedbytes - ichdr2.usedbytes - + ((ichdr.count + ichdr2.count) * + sizeof(xfs_attr_leaf_entry_t)) - + xfs_attr3_leaf_hdr_size(leaf); + + xfs_trans_brelse(state->args->trans, bp); if (bytes >= 0) break; /* fits with at least 25% to spare */ } @@ -1608,10 +1766,10 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); } if (error) @@ -1631,29 +1789,35 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) * If two leaves are 37% full, when combined they will leave 25% free. */ int -xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr3_leaf_remove( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_hdr_t *hdr; - xfs_attr_leaf_map_t *map; - xfs_attr_leaf_entry_t *entry; - int before, after, smallest, entsize; - int tablesize, tmp, i; - xfs_mount_t *mp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + int before; + int after; + int smallest; + int entsize; + int tablesize; + int tmp; + int i; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - hdr = &leaf->hdr; - mp = args->trans->t_mountp; - ASSERT((be16_to_cpu(hdr->count) > 0) - && (be16_to_cpu(hdr->count) < (XFS_LBSIZE(mp)/8))); - ASSERT((args->index >= 0) - && (args->index < be16_to_cpu(hdr->count))); - ASSERT(be16_to_cpu(hdr->firstused) >= - ((be16_to_cpu(hdr->count) * sizeof(*entry)) + sizeof(*hdr))); - entry = &leaf->entries[args->index]; - ASSERT(be16_to_cpu(entry->nameidx) >= be16_to_cpu(hdr->firstused)); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + trace_xfs_attr_leaf_remove(args); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + + ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); + ASSERT(args->index >= 0 && args->index < ichdr.count); + ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) + + xfs_attr3_leaf_hdr_size(leaf)); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; + + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); /* * Scan through free region table: @@ -1661,30 +1825,28 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) * find smallest free region in case we need to replace it, * adjust any map that borders the entry table, */ - tablesize = be16_to_cpu(hdr->count) * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - map = &hdr->freemap[0]; - tmp = be16_to_cpu(map->size); + tablesize = ichdr.count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf); + tmp = ichdr.freemap[0].size; before = after = -1; smallest = XFS_ATTR_LEAF_MAPSIZE - 1; entsize = xfs_attr_leaf_entsize(leaf, args->index); - for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; map++, i++) { - ASSERT(be16_to_cpu(map->base) < XFS_LBSIZE(mp)); - ASSERT(be16_to_cpu(map->size) < XFS_LBSIZE(mp)); - if (be16_to_cpu(map->base) == tablesize) { - be16_add(&map->base, - -((int)sizeof(xfs_attr_leaf_entry_t))); - be16_add(&map->size, sizeof(xfs_attr_leaf_entry_t)); + for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { + ASSERT(ichdr.freemap[i].base < args->geo->blksize); + ASSERT(ichdr.freemap[i].size < args->geo->blksize); + if (ichdr.freemap[i].base == tablesize) { + ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t); + ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t); } - if ((be16_to_cpu(map->base) + be16_to_cpu(map->size)) - == be16_to_cpu(entry->nameidx)) { + if (ichdr.freemap[i].base + ichdr.freemap[i].size == + be16_to_cpu(entry->nameidx)) { before = i; - } else if (be16_to_cpu(map->base) - == (be16_to_cpu(entry->nameidx) + entsize)) { + } else if (ichdr.freemap[i].base == + (be16_to_cpu(entry->nameidx) + entsize)) { after = i; - } else if (be16_to_cpu(map->size) < tmp) { - tmp = be16_to_cpu(map->size); + } else if (ichdr.freemap[i].size < tmp) { + tmp = ichdr.freemap[i].size; smallest = i; } } @@ -1695,36 +1857,30 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) */ if ((before >= 0) || (after >= 0)) { if ((before >= 0) && (after >= 0)) { - map = &hdr->freemap[before]; - be16_add(&map->size, entsize); - be16_add(&map->size, - be16_to_cpu(hdr->freemap[after].size)); - hdr->freemap[after].base = 0; - hdr->freemap[after].size = 0; + ichdr.freemap[before].size += entsize; + ichdr.freemap[before].size += ichdr.freemap[after].size; + ichdr.freemap[after].base = 0; + ichdr.freemap[after].size = 0; } else if (before >= 0) { - map = &hdr->freemap[before]; - be16_add(&map->size, entsize); + ichdr.freemap[before].size += entsize; } else { - map = &hdr->freemap[after]; - /* both on-disk, don't endian flip twice */ - map->base = entry->nameidx; - be16_add(&map->size, entsize); + ichdr.freemap[after].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[after].size += entsize; } } else { /* * Replace smallest region (if it is smaller than free'd entry) */ - map = &hdr->freemap[smallest]; - if (be16_to_cpu(map->size) < entsize) { - map->base = cpu_to_be16(be16_to_cpu(entry->nameidx)); - map->size = cpu_to_be16(entsize); + if (ichdr.freemap[smallest].size < entsize) { + ichdr.freemap[smallest].base = be16_to_cpu(entry->nameidx); + ichdr.freemap[smallest].size = entsize; } } /* * Did we remove the first entry? */ - if (be16_to_cpu(entry->nameidx) == be16_to_cpu(hdr->firstused)) + if (be16_to_cpu(entry->nameidx) == ichdr.firstused) smallest = 1; else smallest = 0; @@ -1732,20 +1888,20 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) /* * Compress the remaining entries and zero out the removed stuff. */ - memset(XFS_ATTR_LEAF_NAME(leaf, args->index), 0, entsize); - be16_add(&hdr->usedbytes, -entsize); - xfs_da_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, XFS_ATTR_LEAF_NAME(leaf, args->index), + memset(xfs_attr3_leaf_name(leaf, args->index), 0, entsize); + ichdr.usedbytes -= entsize; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), entsize)); - tmp = (be16_to_cpu(hdr->count) - args->index) - * sizeof(xfs_attr_leaf_entry_t); - memmove((char *)entry, (char *)(entry+1), tmp); - be16_add(&hdr->count, -1); - xfs_da_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(*entry))); - entry = &leaf->entries[be16_to_cpu(hdr->count)]; - memset((char *)entry, 0, sizeof(xfs_attr_leaf_entry_t)); + tmp = (ichdr.count - args->index) * sizeof(xfs_attr_leaf_entry_t); + memmove(entry, entry + 1, tmp); + ichdr.count--; + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, entry, tmp + sizeof(xfs_attr_leaf_entry_t))); + + entry = &xfs_attr3_leaf_entryp(leaf)[ichdr.count]; + memset(entry, 0, sizeof(xfs_attr_leaf_entry_t)); /* * If we removed the first entry, re-find the first used byte @@ -1754,129 +1910,146 @@ xfs_attr_leaf_remove(xfs_dabuf_t *bp, xfs_da_args_t *args) * removing the name. */ if (smallest) { - tmp = XFS_LBSIZE(mp); - entry = &leaf->entries[0]; - for (i = be16_to_cpu(hdr->count)-1; i >= 0; entry++, i--) { - ASSERT(be16_to_cpu(entry->nameidx) >= - be16_to_cpu(hdr->firstused)); - ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp)); + tmp = args->geo->blksize; + entry = xfs_attr3_leaf_entryp(leaf); + for (i = ichdr.count - 1; i >= 0; entry++, i--) { + ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused); + ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize); if (be16_to_cpu(entry->nameidx) < tmp) tmp = be16_to_cpu(entry->nameidx); } - hdr->firstused = cpu_to_be16(tmp); - if (!hdr->firstused) { - hdr->firstused = cpu_to_be16( - tmp - XFS_ATTR_LEAF_NAME_ALIGN); - } + ichdr.firstused = tmp; + if (!ichdr.firstused) + ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; } else { - hdr->holes = 1; /* mark as needing compaction */ + ichdr.holes = 1; /* mark as needing compaction */ } - xfs_da_log_buf(args->trans, bp, - XFS_DA_LOGRANGE(leaf, hdr, sizeof(*hdr))); + xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); /* * Check if leaf is less than 50% full, caller may want to * "join" the leaf with a sibling if so. */ - tmp = sizeof(xfs_attr_leaf_hdr_t); - tmp += be16_to_cpu(leaf->hdr.count) * sizeof(xfs_attr_leaf_entry_t); - tmp += be16_to_cpu(leaf->hdr.usedbytes); - return(tmp < mp->m_attr_magicpct); /* leaf is < 37% full */ + tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) + + ichdr.count * sizeof(xfs_attr_leaf_entry_t); + + return tmp < args->geo->magicpct; /* leaf is < 37% full */ } /* * Move all the attribute list entries from drop_leaf into save_leaf. */ void -xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_attr3_leaf_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_attr_leafblock_t *drop_leaf, *save_leaf, *tmp_leaf; - xfs_attr_leaf_hdr_t *drop_hdr, *save_hdr, *tmp_hdr; - xfs_mount_t *mp; - char *tmpbuffer; + struct xfs_attr_leafblock *drop_leaf = drop_blk->bp->b_addr; + struct xfs_attr_leafblock *save_leaf = save_blk->bp->b_addr; + struct xfs_attr3_icleaf_hdr drophdr; + struct xfs_attr3_icleaf_hdr savehdr; + struct xfs_attr_leaf_entry *entry; - /* - * Set up environment. - */ - mp = state->mp; - ASSERT(drop_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == XFS_ATTR_LEAF_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; - ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - drop_hdr = &drop_leaf->hdr; - save_hdr = &save_leaf->hdr; + trace_xfs_attr_leaf_unbalance(state->args); + + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + entry = xfs_attr3_leaf_entryp(drop_leaf); /* * Save last hashval from dying block for later Btree fixup. */ - drop_blk->hashval = be32_to_cpu( - drop_leaf->entries[be16_to_cpu(drop_leaf->hdr.count)-1].hashval); + drop_blk->hashval = be32_to_cpu(entry[drophdr.count - 1].hashval); /* * Check if we need a temp buffer, or can we do it in place. * Note that we don't check "leaf" for holes because we will * always be dropping it, toosmall() decided that for us already. */ - if (save_hdr->holes == 0) { + if (savehdr.holes == 0) { /* * dest leaf has no holes, so we add there. May need * to make some room in the entry array. */ - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + save_leaf, &savehdr, 0, + drophdr.count); } else { - xfs_attr_leaf_moveents(drop_leaf, 0, save_leaf, - be16_to_cpu(save_hdr->count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + save_leaf, &savehdr, + savehdr.count, drophdr.count); } } else { /* * Destination has holes, so we make a temporary copy * of the leaf and add them both to that. */ - tmpbuffer = kmem_alloc(state->blocksize, KM_SLEEP); - ASSERT(tmpbuffer != NULL); - memset(tmpbuffer, 0, state->blocksize); - tmp_leaf = (xfs_attr_leafblock_t *)tmpbuffer; - tmp_hdr = &tmp_leaf->hdr; - tmp_hdr->info = save_hdr->info; /* struct copy */ - tmp_hdr->count = 0; - tmp_hdr->firstused = cpu_to_be16(state->blocksize); - if (!tmp_hdr->firstused) { - tmp_hdr->firstused = cpu_to_be16( - state->blocksize - XFS_ATTR_LEAF_NAME_ALIGN); - } - tmp_hdr->usedbytes = 0; - if (xfs_attr_leaf_order(save_blk->bp, drop_blk->bp)) { - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, 0, - be16_to_cpu(drop_hdr->count), mp); - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(save_hdr->count), mp); + struct xfs_attr_leafblock *tmp_leaf; + struct xfs_attr3_icleaf_hdr tmphdr; + + tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP); + + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + + memset(&tmphdr, 0, sizeof(tmphdr)); + tmphdr.magic = savehdr.magic; + tmphdr.forw = savehdr.forw; + tmphdr.back = savehdr.back; + tmphdr.firstused = state->args->geo->blksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, + drop_blk->bp, &drophdr)) { + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, 0, + drophdr.count); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + savehdr.count); } else { - xfs_attr_leaf_moveents(save_leaf, 0, tmp_leaf, 0, - be16_to_cpu(save_hdr->count), mp); - xfs_attr_leaf_moveents(drop_leaf, 0, tmp_leaf, - be16_to_cpu(tmp_leaf->hdr.count), - be16_to_cpu(drop_hdr->count), mp); + xfs_attr3_leaf_moveents(state->args, + save_leaf, &savehdr, 0, + tmp_leaf, &tmphdr, 0, + savehdr.count); + xfs_attr3_leaf_moveents(state->args, + drop_leaf, &drophdr, 0, + tmp_leaf, &tmphdr, tmphdr.count, + drophdr.count); } - memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize); - kmem_free(tmpbuffer, state->blocksize); + memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); + savehdr = tmphdr; /* struct copy */ + kmem_free(tmp_leaf); } - xfs_da_log_buf(state->args->trans, save_blk->bp, 0, - state->blocksize - 1); + xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, + state->args->geo->blksize - 1); /* * Copy out last hashval in each block for B-tree code. */ - save_blk->hashval = be32_to_cpu( - save_leaf->entries[be16_to_cpu(save_leaf->hdr.count)-1].hashval); + entry = xfs_attr3_leaf_entryp(save_leaf); + save_blk->hashval = be32_to_cpu(entry[savehdr.count - 1].hashval); } /*======================================================================== @@ -1897,27 +2070,33 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Don't change the args->value unless we find the attribute. */ int -xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr3_leaf_lookup_int( + struct xfs_buf *bp, + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - int probe, span; - xfs_dahash_t hashval; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + xfs_dahash_t hashval; + int probe; + int span; + + trace_xfs_attr_leaf_lookup(args); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); /* * Binary search. (note: small blocks will skip this loop) */ hashval = args->hashval; - probe = span = be16_to_cpu(leaf->hdr.count) / 2; - for (entry = &leaf->entries[probe]; span > 4; - entry = &leaf->entries[probe]) { + probe = span = ichdr.count / 2; + for (entry = &entries[probe]; span > 4; entry = &entries[probe]) { span /= 2; if (be32_to_cpu(entry->hashval) < hashval) probe += span; @@ -1926,35 +2105,31 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) else break; } - ASSERT((probe >= 0) && - (!leaf->hdr.count - || (probe < be16_to_cpu(leaf->hdr.count)))); - ASSERT((span <= 4) || (be32_to_cpu(entry->hashval) == hashval)); + ASSERT(probe >= 0 && (!ichdr.count || probe < ichdr.count)); + ASSERT(span <= 4 || be32_to_cpu(entry->hashval) == hashval); /* * Since we may have duplicate hashval's, find the first matching * hashval in the leaf. */ - while ((probe > 0) && (be32_to_cpu(entry->hashval) >= hashval)) { + while (probe > 0 && be32_to_cpu(entry->hashval) >= hashval) { entry--; probe--; } - while ((probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) < hashval)) { + while (probe < ichdr.count && + be32_to_cpu(entry->hashval) < hashval) { entry++; probe++; } - if ((probe == be16_to_cpu(leaf->hdr.count)) || - (be32_to_cpu(entry->hashval) != hashval)) { + if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* * Duplicate keys may be present, so search all of them for a match. */ - for ( ; (probe < be16_to_cpu(leaf->hdr.count)) && - (be32_to_cpu(entry->hashval) == hashval); + for (; probe < ichdr.count && (be32_to_cpu(entry->hashval) == hashval); entry++, probe++) { /* * GROT: Add code to remove incomplete entries. @@ -1968,42 +2143,36 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) continue; } if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, probe); + name_loc = xfs_attr3_leaf_name_local(leaf, probe); if (name_loc->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_loc->nameval, - args->namelen) != 0) - continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0)) + if (memcmp(args->name, name_loc->nameval, + args->namelen) != 0) continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - return(XFS_ERROR(EEXIST)); + return XFS_ERROR(EEXIST); } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, probe); + name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); if (name_rmt->namelen != args->namelen) continue; - if (memcmp(args->name, (char *)name_rmt->name, - args->namelen) != 0) - continue; - if (((args->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0)) + if (memcmp(args->name, name_rmt->name, + args->namelen) != 0) continue; - if (((args->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0)) + if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - return(XFS_ERROR(EEXIST)); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->rmtvaluelen); + return XFS_ERROR(EEXIST); } } args->index = probe; - return(XFS_ERROR(ENOATTR)); + return XFS_ERROR(ENOATTR); } /* @@ -2011,54 +2180,57 @@ xfs_attr_leaf_lookup_int(xfs_dabuf_t *bp, xfs_da_args_t *args) * list structure. */ int -xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) +xfs_attr3_leaf_getvalue( + struct xfs_buf *bp, + struct xfs_da_args *args) { - int valuelen; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(leaf->hdr.count) - < (XFS_LBSIZE(args->dp->i_mount)/8)); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); - - entry = &leaf->entries[args->index]; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_local *name_loc; + struct xfs_attr_leaf_name_remote *name_rmt; + int valuelen; + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(ichdr.count < args->geo->blksize / 8); + ASSERT(args->index < ichdr.count); + + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); ASSERT(name_loc->namelen == args->namelen); ASSERT(memcmp(args->name, name_loc->nameval, args->namelen) == 0); valuelen = be16_to_cpu(name_loc->valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; - return(0); + return 0; } if (args->valuelen < valuelen) { args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + return XFS_ERROR(ERANGE); } args->valuelen = valuelen; memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; - return(0); + args->valuelen = args->rmtvaluelen; + return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; - return(XFS_ERROR(ERANGE)); + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; + return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } - return(0); + return 0; } /*======================================================================== @@ -2071,13 +2243,21 @@ xfs_attr_leaf_getvalue(xfs_dabuf_t *bp, xfs_da_args_t *args) */ /*ARGSUSED*/ STATIC void -xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, - xfs_attr_leafblock_t *leaf_d, int start_d, - int count, xfs_mount_t *mp) +xfs_attr3_leaf_moveents( + struct xfs_da_args *args, + struct xfs_attr_leafblock *leaf_s, + struct xfs_attr3_icleaf_hdr *ichdr_s, + int start_s, + struct xfs_attr_leafblock *leaf_d, + struct xfs_attr3_icleaf_hdr *ichdr_d, + int start_d, + int count) { - xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; - xfs_attr_leaf_entry_t *entry_s, *entry_d; - int desti, tmp, i; + struct xfs_attr_leaf_entry *entry_s; + struct xfs_attr_leaf_entry *entry_d; + int desti; + int tmp; + int i; /* * Check for nothing to do. @@ -2088,45 +2268,41 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Set up environment. */ - ASSERT(be16_to_cpu(leaf_s->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(be16_to_cpu(leaf_d->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - hdr_s = &leaf_s->hdr; - hdr_d = &leaf_d->hdr; - ASSERT((be16_to_cpu(hdr_s->count) > 0) && - (be16_to_cpu(hdr_s->count) < (XFS_LBSIZE(mp)/8))); - ASSERT(be16_to_cpu(hdr_s->firstused) >= - ((be16_to_cpu(hdr_s->count) - * sizeof(*entry_s))+sizeof(*hdr_s))); - ASSERT(be16_to_cpu(hdr_d->count) < (XFS_LBSIZE(mp)/8)); - ASSERT(be16_to_cpu(hdr_d->firstused) >= - ((be16_to_cpu(hdr_d->count) - * sizeof(*entry_d))+sizeof(*hdr_d))); - - ASSERT(start_s < be16_to_cpu(hdr_s->count)); - ASSERT(start_d <= be16_to_cpu(hdr_d->count)); - ASSERT(count <= be16_to_cpu(hdr_s->count)); + ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC || + ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC); + ASSERT(ichdr_s->magic == ichdr_d->magic); + ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8); + ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s)) + + xfs_attr3_leaf_hdr_size(leaf_s)); + ASSERT(ichdr_d->count < args->geo->blksize / 8); + ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d)) + + xfs_attr3_leaf_hdr_size(leaf_d)); + + ASSERT(start_s < ichdr_s->count); + ASSERT(start_d <= ichdr_d->count); + ASSERT(count <= ichdr_s->count); + /* * Move the entries in the destination leaf up to make a hole? */ - if (start_d < be16_to_cpu(hdr_d->count)) { - tmp = be16_to_cpu(hdr_d->count) - start_d; + if (start_d < ichdr_d->count) { + tmp = ichdr_d->count - start_d; tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_d->entries[start_d]; - entry_d = &leaf_d->entries[start_d + count]; - memmove((char *)entry_d, (char *)entry_s, tmp); + entry_s = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d + count]; + memmove(entry_d, entry_s, tmp); } /* * Copy all entry's in the same (sorted) order, * but allocate attribute info packed and in sequence. */ - entry_s = &leaf_s->entries[start_s]; - entry_d = &leaf_d->entries[start_d]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + entry_d = &xfs_attr3_leaf_entryp(leaf_d)[start_d]; desti = start_d; for (i = 0; i < count; entry_s++, entry_d++, desti++, i++) { - ASSERT(be16_to_cpu(entry_s->nameidx) - >= be16_to_cpu(hdr_s->firstused)); + ASSERT(be16_to_cpu(entry_s->nameidx) >= ichdr_s->firstused); tmp = xfs_attr_leaf_entsize(leaf_s, start_s + i); #ifdef GROT /* @@ -2135,36 +2311,34 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, * off for 6.2, should be revisited later. */ if (entry_s->flags & XFS_ATTR_INCOMPLETE) { /* skip partials? */ - memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); - be16_add(&hdr_s->usedbytes, -tmp); - be16_add(&hdr_s->count, -1); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_s->count -= 1; entry_d--; /* to compensate for ++ in loop hdr */ desti--; if ((start_s + i) < offset) result++; /* insertion index adjustment */ } else { #endif /* GROT */ - be16_add(&hdr_d->firstused, -tmp); + ichdr_d->firstused -= tmp; /* both on-disk, don't endian flip twice */ entry_d->hashval = entry_s->hashval; - /* both on-disk, don't endian flip twice */ - entry_d->nameidx = hdr_d->firstused; + entry_d->nameidx = cpu_to_be16(ichdr_d->firstused); entry_d->flags = entry_s->flags; ASSERT(be16_to_cpu(entry_d->nameidx) + tmp - <= XFS_LBSIZE(mp)); - memmove(XFS_ATTR_LEAF_NAME(leaf_d, desti), - XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), tmp); + <= args->geo->blksize); + memmove(xfs_attr3_leaf_name(leaf_d, desti), + xfs_attr3_leaf_name(leaf_s, start_s + i), tmp); ASSERT(be16_to_cpu(entry_s->nameidx) + tmp - <= XFS_LBSIZE(mp)); - memset(XFS_ATTR_LEAF_NAME(leaf_s, start_s + i), 0, tmp); - be16_add(&hdr_s->usedbytes, -tmp); - be16_add(&hdr_d->usedbytes, tmp); - be16_add(&hdr_s->count, -1); - be16_add(&hdr_d->count, 1); - tmp = be16_to_cpu(hdr_d->count) - * sizeof(xfs_attr_leaf_entry_t) - + sizeof(xfs_attr_leaf_hdr_t); - ASSERT(be16_to_cpu(hdr_d->firstused) >= tmp); + <= args->geo->blksize); + memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp); + ichdr_s->usedbytes -= tmp; + ichdr_d->usedbytes += tmp; + ichdr_s->count -= 1; + ichdr_d->count += 1; + tmp = ichdr_d->count * sizeof(xfs_attr_leaf_entry_t) + + xfs_attr3_leaf_hdr_size(leaf_d); + ASSERT(ichdr_d->firstused >= tmp); #ifdef GROT } #endif /* GROT */ @@ -2173,86 +2347,60 @@ xfs_attr_leaf_moveents(xfs_attr_leafblock_t *leaf_s, int start_s, /* * Zero out the entries we just copied. */ - if (start_s == be16_to_cpu(hdr_s->count)) { + if (start_s == ichdr_s->count) { tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + ((char *)leaf_s + args->geo->blksize)); + memset(entry_s, 0, tmp); } else { /* * Move the remaining entries down to fill the hole, * then zero the entries at the top. */ - tmp = be16_to_cpu(hdr_s->count) - count; - tmp *= sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[start_s + count]; - entry_d = &leaf_s->entries[start_s]; - memmove((char *)entry_d, (char *)entry_s, tmp); + tmp = (ichdr_s->count - count) * sizeof(xfs_attr_leaf_entry_t); + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s + count]; + entry_d = &xfs_attr3_leaf_entryp(leaf_s)[start_s]; + memmove(entry_d, entry_s, tmp); tmp = count * sizeof(xfs_attr_leaf_entry_t); - entry_s = &leaf_s->entries[be16_to_cpu(hdr_s->count)]; + entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count]; ASSERT(((char *)entry_s + tmp) <= - ((char *)leaf_s + XFS_LBSIZE(mp))); - memset((char *)entry_s, 0, tmp); + ((char *)leaf_s + args->geo->blksize)); + memset(entry_s, 0, tmp); } /* * Fill in the freemap information */ - hdr_d->freemap[0].base = cpu_to_be16(sizeof(xfs_attr_leaf_hdr_t)); - be16_add(&hdr_d->freemap[0].base, be16_to_cpu(hdr_d->count) * - sizeof(xfs_attr_leaf_entry_t)); - hdr_d->freemap[0].size = cpu_to_be16(be16_to_cpu(hdr_d->firstused) - - be16_to_cpu(hdr_d->freemap[0].base)); - hdr_d->freemap[1].base = 0; - hdr_d->freemap[2].base = 0; - hdr_d->freemap[1].size = 0; - hdr_d->freemap[2].size = 0; - hdr_s->holes = 1; /* leaf may not be compact */ -} - -/* - * Compare two leaf blocks "order". - * Return 0 unless leaf2 should go before leaf1. - */ -int -xfs_attr_leaf_order(xfs_dabuf_t *leaf1_bp, xfs_dabuf_t *leaf2_bp) -{ - xfs_attr_leafblock_t *leaf1, *leaf2; - - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; - ASSERT((be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC) && - (be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC)); - if ((be16_to_cpu(leaf1->hdr.count) > 0) && - (be16_to_cpu(leaf2->hdr.count) > 0) && - ((be32_to_cpu(leaf2->entries[0].hashval) < - be32_to_cpu(leaf1->entries[0].hashval)) || - (be32_to_cpu(leaf2->entries[ - be16_to_cpu(leaf2->hdr.count)-1].hashval) < - be32_to_cpu(leaf1->entries[ - be16_to_cpu(leaf1->hdr.count)-1].hashval)))) { - return(1); - } - return(0); + ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_d); + ichdr_d->freemap[0].base += ichdr_d->count * sizeof(xfs_attr_leaf_entry_t); + ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + ichdr_d->freemap[1].base = 0; + ichdr_d->freemap[2].base = 0; + ichdr_d->freemap[1].size = 0; + ichdr_d->freemap[2].size = 0; + ichdr_s->holes = 1; /* leaf may not be compact */ } /* * Pick up the last hashvalue from a leaf block. */ xfs_dahash_t -xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) +xfs_attr_leaf_lasthash( + struct xfs_buf *bp, + int *count) { - xfs_attr_leafblock_t *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); + xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) - return(0); - return be32_to_cpu(leaf->entries[be16_to_cpu(leaf->hdr.count)-1].hashval); + *count = ichdr.count; + if (!ichdr.count) + return 0; + return be32_to_cpu(entries[ichdr.count - 1].hashval); } /* @@ -2262,20 +2410,21 @@ xfs_attr_leaf_lasthash(xfs_dabuf_t *bp, int *count) STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) { + struct xfs_attr_leaf_entry *entries; xfs_attr_leaf_name_local_t *name_loc; xfs_attr_leaf_name_remote_t *name_rmt; int size; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - if (leaf->entries[index].flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, index); - size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(name_loc->namelen, + entries = xfs_attr3_leaf_entryp(leaf); + if (entries[index].flags & XFS_ATTR_LOCAL) { + name_loc = xfs_attr3_leaf_name_local(leaf, index); + size = xfs_attr_leaf_entsize_local(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, index); - size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(name_rmt->namelen); + name_rmt = xfs_attr3_leaf_name_remote(leaf, index); + size = xfs_attr_leaf_entsize_remote(name_rmt->namelen); } - return(size); + return size; } /* @@ -2285,196 +2434,23 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index) * a "local" or a "remote" attribute. */ int -xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local) +xfs_attr_leaf_newentsize( + struct xfs_da_args *args, + int *local) { - int size; + int size; - size = XFS_ATTR_LEAF_ENTSIZE_LOCAL(namelen, valuelen); - if (size < XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(blocksize)) { - if (local) { + size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen); + if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) { + if (local) *local = 1; - } - } else { - size = XFS_ATTR_LEAF_ENTSIZE_REMOTE(namelen); - if (local) { - *local = 0; - } + return size; } - return(size); + if (local) + *local = 0; + return xfs_attr_leaf_entsize_remote(args->namelen); } -/* - * Copy out attribute list entries for attr_list(), for leaf attribute lists. - */ -int -xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) -{ - attrlist_cursor_kern_t *cursor; - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_local_t *name_loc; - xfs_attr_leaf_name_remote_t *name_rmt; - int retval, i; - - ASSERT(bp != NULL); - leaf = bp->data; - cursor = context->cursor; - cursor->initted = 1; - - xfs_attr_trace_l_cl("blk start", context, leaf); - - /* - * Re-find our place in the leaf block if this is a new syscall. - */ - if (context->resynch) { - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { - if (be32_to_cpu(entry->hashval) == cursor->hashval) { - if (cursor->offset == context->dupcnt) { - context->dupcnt = 0; - break; - } - context->dupcnt++; - } else if (be32_to_cpu(entry->hashval) > - cursor->hashval) { - context->dupcnt = 0; - break; - } - } - if (i == be16_to_cpu(leaf->hdr.count)) { - xfs_attr_trace_l_c("not found", context); - return(0); - } - } else { - entry = &leaf->entries[0]; - i = 0; - } - context->resynch = 0; - - /* - * We have found our place, start copying out the new attributes. - */ - retval = 0; - for ( ; (i < be16_to_cpu(leaf->hdr.count)) - && (retval == 0); entry++, i++) { - attrnames_t *namesp; - - if (be32_to_cpu(entry->hashval) != cursor->hashval) { - cursor->hashval = be32_to_cpu(entry->hashval); - cursor->offset = 0; - } - - if (entry->flags & XFS_ATTR_INCOMPLETE) - continue; /* skip incomplete entries */ - if (((context->flags & ATTR_SECURE) != 0) != - ((entry->flags & XFS_ATTR_SECURE) != 0) && - !(context->flags & ATTR_KERNORMALS)) - continue; /* skip non-matching entries */ - if (((context->flags & ATTR_ROOT) != 0) != - ((entry->flags & XFS_ATTR_ROOT) != 0) && - !(context->flags & ATTR_KERNROOTLS)) - continue; /* skip non-matching entries */ - - namesp = (entry->flags & XFS_ATTR_SECURE) ? &attr_secure : - ((entry->flags & XFS_ATTR_ROOT) ? &attr_trusted : - &attr_user); - - if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, i); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - (int)name_loc->namelen + 1; - } else { - retval = xfs_attr_put_listent(context, namesp, - (char *)name_loc->nameval, - (int)name_loc->namelen, - be16_to_cpu(name_loc->valuelen)); - } - } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); - if (context->flags & ATTR_KERNOVAL) { - ASSERT(context->flags & ATTR_KERNAMELS); - context->count += namesp->attr_namelen + - (int)name_rmt->namelen + 1; - } else { - retval = xfs_attr_put_listent(context, namesp, - (char *)name_rmt->name, - (int)name_rmt->namelen, - be32_to_cpu(name_rmt->valuelen)); - } - } - if (retval == 0) { - cursor->offset++; - } - } - xfs_attr_trace_l_cl("blk end", context, leaf); - return(retval); -} - -#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ - (((struct attrlist_ent *) 0)->a_name - (char *) 0) -#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ - ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ - & ~(sizeof(u_int32_t)-1)) - -/* - * Format an attribute and copy it out to the user's buffer. - * Take care to check values and protect against them changing later, - * we may be reading them directly out of a user buffer. - */ -/*ARGSUSED*/ -STATIC int -xfs_attr_put_listent(xfs_attr_list_context_t *context, - attrnames_t *namesp, char *name, int namelen, int valuelen) -{ - attrlist_ent_t *aep; - int arraytop; - - ASSERT(!(context->flags & ATTR_KERNOVAL)); - if (context->flags & ATTR_KERNAMELS) { - char *offset; - - ASSERT(context->count >= 0); - - arraytop = context->count + namesp->attr_namelen + namelen + 1; - if (arraytop > context->firstu) { - context->count = -1; /* insufficient space */ - return(1); - } - offset = (char *)context->alist + context->count; - strncpy(offset, namesp->attr_name, namesp->attr_namelen); - offset += namesp->attr_namelen; - strncpy(offset, name, namelen); /* real name */ - offset += namelen; - *offset = '\0'; - context->count += namesp->attr_namelen + namelen + 1; - return(0); - } - - ASSERT(context->count >= 0); - ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); - ASSERT(context->firstu >= sizeof(*context->alist)); - ASSERT(context->firstu <= context->bufsize); - - arraytop = sizeof(*context->alist) + - context->count * sizeof(context->alist->al_offset[0]); - context->firstu -= ATTR_ENTSIZE(namelen); - if (context->firstu < arraytop) { - xfs_attr_trace_l_c("buffer full", context); - context->alist->al_more = 1; - return(1); - } - - aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]); - aep->a_valuelen = valuelen; - memcpy(aep->a_name, name, namelen); - aep->a_name[ namelen ] = 0; - context->alist->al_offset[ context->count++ ] = context->firstu; - context->alist->al_count = context->count; - xfs_attr_trace_l_c("add", context); - return(0); -} /*======================================================================== * Manage the INCOMPLETE flag in a leaf entry @@ -2484,43 +2460,44 @@ xfs_attr_put_listent(xfs_attr_list_context_t *context, * Clear the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_clearflag(xfs_da_args_t *args) +xfs_attr3_leaf_clearflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; - int error; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; + int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; xfs_attr_leaf_name_local_t *name_loc; int namelen; char *name; #endif /* DEBUG */ + trace_xfs_attr_leaf_clearflag(args); /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); - ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; + leaf = bp->b_addr; + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); + ASSERT(args->index >= 0); + if (entry->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf, args->index); namelen = name_loc->namelen; name = (char *)name_loc->nameval; } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); namelen = name_rmt->namelen; name = (char *)name_rmt->name; } @@ -2530,74 +2507,73 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) #endif /* DEBUG */ entry->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if (args->rmtblkno) { ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp, + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); + return xfs_trans_roll(&args->trans, args->dp); } /* * Set the INCOMPLETE flag on an entry in a leaf block. */ int -xfs_attr_leaf_setflag(xfs_da_args_t *args) +xfs_attr3_leaf_setflag( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp; + struct xfs_attr_leafblock *leaf; + struct xfs_attr_leaf_entry *entry; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp; int error; +#ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr; +#endif + + trace_xfs_attr_leaf_setflag(args); /* * Set up the operation. */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, - XFS_ATTR_FORK); - if (error) { + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); + if (error) return(error); - } - ASSERT(bp != NULL); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); + leaf = bp->b_addr; +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); - entry = &leaf->entries[ args->index ]; +#endif + entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; ASSERT((entry->flags & XFS_ATTR_INCOMPLETE) == 0); entry->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, entry, sizeof(*entry))); if ((entry->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp, + xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp); /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); + return xfs_trans_roll(&args->trans, args->dp); } /* @@ -2608,71 +2584,76 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) * Note that they could be in different blocks, or in the same block. */ int -xfs_attr_leaf_flipflags(xfs_da_args_t *args) +xfs_attr3_leaf_flipflags( + struct xfs_da_args *args) { - xfs_attr_leafblock_t *leaf1, *leaf2; - xfs_attr_leaf_entry_t *entry1, *entry2; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_dabuf_t *bp1, *bp2; + struct xfs_attr_leafblock *leaf1; + struct xfs_attr_leafblock *leaf2; + struct xfs_attr_leaf_entry *entry1; + struct xfs_attr_leaf_entry *entry2; + struct xfs_attr_leaf_name_remote *name_rmt; + struct xfs_buf *bp1; + struct xfs_buf *bp2; int error; #ifdef DEBUG + struct xfs_attr3_icleaf_hdr ichdr1; + struct xfs_attr3_icleaf_hdr ichdr2; xfs_attr_leaf_name_local_t *name_loc; int namelen1, namelen2; char *name1, *name2; #endif /* DEBUG */ + trace_xfs_attr_leaf_flipflags(args); + /* * Read the block containing the "old" attr */ - error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, - XFS_ATTR_FORK); - if (error) { - return(error); - } - ASSERT(bp1 != NULL); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); + if (error) + return error; /* * Read the block containing the "new" attr, if it is different */ if (args->blkno2 != args->blkno) { - error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, - -1, &bp2, XFS_ATTR_FORK); - if (error) { - return(error); - } - ASSERT(bp2 != NULL); + error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno2, + -1, &bp2); + if (error) + return error; } else { bp2 = bp1; } - leaf1 = bp1->data; - ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); + leaf1 = bp1->b_addr; + entry1 = &xfs_attr3_leaf_entryp(leaf1)[args->index]; + + leaf2 = bp2->b_addr; + entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; + +#ifdef DEBUG + xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + ASSERT(args->index < ichdr1.count); ASSERT(args->index >= 0); - entry1 = &leaf1->entries[ args->index ]; - leaf2 = bp2->data; - ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); + xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + ASSERT(args->index2 < ichdr2.count); ASSERT(args->index2 >= 0); - entry2 = &leaf2->entries[ args->index2 ]; -#ifdef DEBUG if (entry1->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf1, args->index); + name_loc = xfs_attr3_leaf_name_local(leaf1, args->index); namelen1 = name_loc->namelen; name1 = (char *)name_loc->nameval; } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); namelen1 = name_rmt->namelen; name1 = (char *)name_rmt->name; } if (entry2->flags & XFS_ATTR_LOCAL) { - name_loc = XFS_ATTR_LEAF_NAME_LOCAL(leaf2, args->index2); + name_loc = xfs_attr3_leaf_name_local(leaf2, args->index2); namelen2 = name_loc->namelen; name2 = (char *)name_loc->nameval; } else { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); namelen2 = name_rmt->namelen; name2 = (char *)name_rmt->name; } @@ -2685,397 +2666,32 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) ASSERT((entry2->flags & XFS_ATTR_INCOMPLETE) == 0); entry1->flags &= ~XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp1, + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, entry1, sizeof(*entry1))); if (args->rmtblkno) { ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf1, args->index); + name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); - xfs_da_log_buf(args->trans, bp1, + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); + xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } entry2->flags |= XFS_ATTR_INCOMPLETE; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, entry2, sizeof(*entry2))); if ((entry2->flags & XFS_ATTR_LOCAL) == 0) { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf2, args->index2); + name_rmt = xfs_attr3_leaf_name_remote(leaf2, args->index2); name_rmt->valueblk = 0; name_rmt->valuelen = 0; - xfs_da_log_buf(args->trans, bp2, + xfs_trans_log_buf(args->trans, bp2, XFS_DA_LOGRANGE(leaf2, name_rmt, sizeof(*name_rmt))); } - xfs_da_buf_done(bp1); - if (bp1 != bp2) - xfs_da_buf_done(bp2); /* * Commit the flag value change and start the next trans in series. */ - error = xfs_attr_rolltrans(&args->trans, args->dp); - - return(error); -} - -/*======================================================================== - * Indiscriminately delete the entire attribute fork - *========================================================================*/ - -/* - * Recurse (gasp!) through the attribute nodes until we find leaves. - * We're doing a depth-first traversal in order to invalidate everything. - */ -int -xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) -{ - xfs_da_blkinfo_t *info; - xfs_daddr_t blkno; - xfs_dabuf_t *bp; - int error; - - /* - * Read block 0 to see what we have to work with. - * We only get here if we have extents, since we remove - * the extents in reverse order the extent containing - * block 0 must still be there. - */ - error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); - if (error) - return(error); - blkno = xfs_da_blkno(bp); - - /* - * Invalidate the tree, even if the "tree" is only a single leaf block. - * This is a depth-first traversal! - */ - info = bp->data; - if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { - error = xfs_attr_node_inactive(trans, dp, bp, 1); - } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { - error = xfs_attr_leaf_inactive(trans, dp, bp); - } else { - error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, bp); - } - if (error) - return(error); - - /* - * Invalidate the incore copy of the root block. - */ - error = xfs_da_get_buf(*trans, dp, 0, blkno, &bp, XFS_ATTR_FORK); - if (error) - return(error); - xfs_da_binval(*trans, bp); /* remove from cache */ - /* - * Commit the invalidate and start the next transaction. - */ - error = xfs_attr_rolltrans(trans, dp); - - return (error); -} - -/* - * Recurse (gasp!) through the attribute nodes until we find leaves. - * We're doing a depth-first traversal in order to invalidate everything. - */ -STATIC int -xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp, - int level) -{ - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_dablk_t child_fsb; - xfs_daddr_t parent_blkno, child_blkno; - int error, count, i; - xfs_dabuf_t *child_bp; - - /* - * Since this code is recursive (gasp!) we must protect ourselves. - */ - if (level > XFS_DA_NODE_MAXDEPTH) { - xfs_da_brelse(*trans, bp); /* no locks for later trans */ - return(XFS_ERROR(EIO)); - } - - node = bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - parent_blkno = xfs_da_blkno(bp); /* save for re-read later */ - count = be16_to_cpu(node->hdr.count); - if (!count) { - xfs_da_brelse(*trans, bp); - return(0); - } - child_fsb = be32_to_cpu(node->btree[0].before); - xfs_da_brelse(*trans, bp); /* no locks for later trans */ - - /* - * If this is the node level just above the leaves, simply loop - * over the leaves removing all of them. If this is higher up - * in the tree, recurse downward. - */ - for (i = 0; i < count; i++) { - /* - * Read the subsidiary block to see what we have to work with. - * Don't do this in a transaction. This is a depth-first - * traversal of the tree so we may deal with many blocks - * before we come back to this one. - */ - error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, - XFS_ATTR_FORK); - if (error) - return(error); - if (child_bp) { - /* save for re-read later */ - child_blkno = xfs_da_blkno(child_bp); - - /* - * Invalidate the subtree, however we have to. - */ - info = child_bp->data; - if (be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC) { - error = xfs_attr_node_inactive(trans, dp, - child_bp, level+1); - } else if (be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC) { - error = xfs_attr_leaf_inactive(trans, dp, - child_bp); - } else { - error = XFS_ERROR(EIO); - xfs_da_brelse(*trans, child_bp); - } - if (error) - return(error); - - /* - * Remove the subsidiary block from the cache - * and from the log. - */ - error = xfs_da_get_buf(*trans, dp, 0, child_blkno, - &child_bp, XFS_ATTR_FORK); - if (error) - return(error); - xfs_da_binval(*trans, child_bp); - } - - /* - * If we're not done, re-read the parent to get the next - * child block number. - */ - if ((i+1) < count) { - error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, - &bp, XFS_ATTR_FORK); - if (error) - return(error); - child_fsb = be32_to_cpu(node->btree[i+1].before); - xfs_da_brelse(*trans, bp); - } - /* - * Atomically commit the whole invalidate stuff. - */ - if ((error = xfs_attr_rolltrans(trans, dp))) - return (error); - } - - return(0); -} - -/* - * Invalidate all of the "remote" value regions pointed to by a particular - * leaf block. - * Note that we must release the lock on the buffer so that we are not - * caught holding something that the logging code wants to flush to disk. - */ -STATIC int -xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp) -{ - xfs_attr_leafblock_t *leaf; - xfs_attr_leaf_entry_t *entry; - xfs_attr_leaf_name_remote_t *name_rmt; - xfs_attr_inactive_list_t *list, *lp; - int error, count, size, tmp, i; - - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_ATTR_LEAF_MAGIC); - - /* - * Count the number of "remote" value extents. - */ - count = 0; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); - if (name_rmt->valueblk) - count++; - } - } - - /* - * If there are no "remote" values, we're done. - */ - if (count == 0) { - xfs_da_brelse(*trans, bp); - return(0); - } - - /* - * Allocate storage for a list of all the "remote" value extents. - */ - size = count * sizeof(xfs_attr_inactive_list_t); - list = (xfs_attr_inactive_list_t *)kmem_alloc(size, KM_SLEEP); - - /* - * Identify each of the "remote" value extents. - */ - lp = list; - entry = &leaf->entries[0]; - for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) { - if (be16_to_cpu(entry->nameidx) && - ((entry->flags & XFS_ATTR_LOCAL) == 0)) { - name_rmt = XFS_ATTR_LEAF_NAME_REMOTE(leaf, i); - if (name_rmt->valueblk) { - lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, - be32_to_cpu(name_rmt->valuelen)); - lp++; - } - } - } - xfs_da_brelse(*trans, bp); /* unlock for trans. in freextent() */ - - /* - * Invalidate each of the "remote" value extents. - */ - error = 0; - for (lp = list, i = 0; i < count; i++, lp++) { - tmp = xfs_attr_leaf_freextent(trans, dp, - lp->valueblk, lp->valuelen); - - if (error == 0) - error = tmp; /* save only the 1st errno */ - } - - kmem_free((xfs_caddr_t)list, size); - return(error); -} - -/* - * Look at all the extents for this logical region, - * invalidate any buffers that are incore/in transactions. - */ -STATIC int -xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp, - xfs_dablk_t blkno, int blkcnt) -{ - xfs_bmbt_irec_t map; - xfs_dablk_t tblkno; - int tblkcnt, dblkcnt, nmap, error; - xfs_daddr_t dblkno; - xfs_buf_t *bp; - - /* - * Roll through the "value", invalidating the attribute value's - * blocks. - */ - tblkno = blkno; - tblkcnt = blkcnt; - while (tblkcnt > 0) { - /* - * Try to remember where we decided to put the value. - */ - nmap = 1; - error = xfs_bmapi(*trans, dp, (xfs_fileoff_t)tblkno, tblkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - NULL, 0, &map, &nmap, NULL, NULL); - if (error) { - return(error); - } - ASSERT(nmap == 1); - ASSERT(map.br_startblock != DELAYSTARTBLOCK); - - /* - * If it's a hole, these are already unmapped - * so there's nothing to invalidate. - */ - if (map.br_startblock != HOLESTARTBLOCK) { - - dblkno = XFS_FSB_TO_DADDR(dp->i_mount, - map.br_startblock); - dblkcnt = XFS_FSB_TO_BB(dp->i_mount, - map.br_blockcount); - bp = xfs_trans_get_buf(*trans, - dp->i_mount->m_ddev_targp, - dblkno, dblkcnt, XFS_BUF_LOCK); - xfs_trans_binval(*trans, bp); - /* - * Roll to next transaction. - */ - if ((error = xfs_attr_rolltrans(trans, dp))) - return (error); - } - - tblkno += map.br_blockcount; - tblkcnt -= map.br_blockcount; - } - - return(0); -} - - -/* - * Roll from one trans in the sequence of PERMANENT transactions to the next. - */ -int -xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp) -{ - xfs_trans_t *trans; - unsigned int logres, count; - int error; - - /* - * Ensure that the inode is always logged. - */ - trans = *transp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - - /* - * Copy the critical parameters from one trans to the next. - */ - logres = trans->t_log_res; - count = trans->t_log_count; - *transp = xfs_trans_dup(trans); - - /* - * Commit the current transaction. - * If this commit failed, then it'd just unlock those items that - * are not marked ihold. That also means that a filesystem shutdown - * is in progress. The caller takes the responsibility to cancel - * the duplicate transaction that gets returned. - */ - if ((error = xfs_trans_commit(trans, 0, NULL))) - return (error); - - trans = *transp; - - /* - * Reserve space in the log for th next transaction. - * This also pushes items in the "AIL", the list of logged items, - * out to disk if they are taking up space at the tail of the log - * that we want to use. This requires that either nothing be locked - * across this call, or that anything that is locked be logged in - * the prior and the next transactions. - */ - error = xfs_trans_reserve(trans, 0, logres, 0, - XFS_TRANS_PERM_LOG_RES, count); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (!error) { - xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL); - xfs_trans_ihold(trans, dp); - } - return (error); + error = xfs_trans_roll(&args->trans, args->dp); + return error; } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index 51c3ee156b2..e2929da7c3b 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -18,196 +19,15 @@ #ifndef __XFS_ATTR_LEAF_H__ #define __XFS_ATTR_LEAF_H__ -/* - * Attribute storage layout, internal structure, access macros, etc. - * - * Attribute lists are structured around Btrees where all the data - * elements are in the leaf nodes. Attribute names are hashed into an int, - * then that int is used as the index into the Btree. Since the hashval - * of an attribute name may not be unique, we may have duplicate keys. The - * internal links in the Btree are logical block offsets into the file. - */ - struct attrlist; struct attrlist_cursor_kern; -struct attrnames; -struct xfs_dabuf; +struct xfs_attr_list_context; struct xfs_da_args; struct xfs_da_state; struct xfs_da_state_blk; struct xfs_inode; struct xfs_trans; -/*======================================================================== - * Attribute structure when equal to XFS_LBSIZE(mp) bytes. - *========================================================================*/ - -/* - * This is the structure of the leaf nodes in the Btree. - * - * Struct leaf_entry's are packed from the top. Name/values grow from the - * bottom but are not packed. The freemap contains run-length-encoded entries - * for the free bytes after the leaf_entry's, but only the N largest such, - * smaller runs are dropped. When the freemap doesn't show enough space - * for an allocation, we compact the name/value area and try again. If we - * still don't have enough space, then we have to split the block. The - * name/value structs (both local and remote versions) must be 32bit aligned. - * - * Since we have duplicate hash keys, for each key that matches, compare - * the actual name string. The root and intermediate node search always - * takes the first-in-the-block key match found, so we should only have - * to work "forw"ard. If none matches, continue with the "forw"ard leaf - * nodes until the hash key changes or the attribute name is found. - * - * We store the fact that an attribute is a ROOT/USER/SECURE attribute in - * the leaf_entry. The namespaces are independent only because we also look - * at the namespace bit when we are looking for a matching attribute name. - * - * We also store an "incomplete" bit in the leaf_entry. It shows that an - * attribute is in the middle of being created and should not be shown to - * the user if we crash during the time that the bit is set. We clear the - * bit when we have finished setting up the attribute. We do this because - * we cannot create some large attributes inside a single transaction, and we - * need some indication that we weren't finished if we crash in the middle. - */ -#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ - -typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ - __be16 base; /* base of free region */ - __be16 size; /* length of free region */ -} xfs_attr_leaf_map_t; - -typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active leaf_entry's */ - __be16 usedbytes; /* num bytes of names/values stored */ - __be16 firstused; /* first used byte in name area */ - __u8 holes; /* != 0 if blk needs compaction */ - __u8 pad1; - xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; - /* N largest free regions */ -} xfs_attr_leaf_hdr_t; - -typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ - __be32 hashval; /* hash value of name */ - __be16 nameidx; /* index into buffer of name/value */ - __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ - __u8 pad2; /* unused pad byte */ -} xfs_attr_leaf_entry_t; - -typedef struct xfs_attr_leaf_name_local { - __be16 valuelen; /* number of bytes in value */ - __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ -} xfs_attr_leaf_name_local_t; - -typedef struct xfs_attr_leaf_name_remote { - __be32 valueblk; /* block number of value bytes */ - __be32 valuelen; /* number of bytes in value */ - __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ -} xfs_attr_leaf_name_remote_t; - -typedef struct xfs_attr_leafblock { - xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ - xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ - xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ -} xfs_attr_leafblock_t; - -/* - * Flags used in the leaf_entry[i].flags field. - * NOTE: the INCOMPLETE bit must not collide with the flags bits specified - * on the system call, they are "or"ed together for various operations. - */ -#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ -#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ -#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ -#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ -#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) -#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) -#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) -#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) - -/* - * Alignment for namelist and valuelist entries (since they are mixed - * there can be only one alignment value) - */ -#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) - -/* - * Cast typed pointers for "local" and "remote" name/value structs. - */ -#define XFS_ATTR_LEAF_NAME_REMOTE(leafp,idx) \ - xfs_attr_leaf_name_remote(leafp,idx) -static inline xfs_attr_leaf_name_remote_t * -xfs_attr_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) -{ - return (xfs_attr_leaf_name_remote_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; -} - -#define XFS_ATTR_LEAF_NAME_LOCAL(leafp,idx) \ - xfs_attr_leaf_name_local(leafp,idx) -static inline xfs_attr_leaf_name_local_t * -xfs_attr_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) -{ - return (xfs_attr_leaf_name_local_t *) - &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; -} - -#define XFS_ATTR_LEAF_NAME(leafp,idx) \ - xfs_attr_leaf_name(leafp,idx) -static inline char *xfs_attr_leaf_name(xfs_attr_leafblock_t *leafp, int idx) -{ - return &((char *)leafp)[be16_to_cpu(leafp->entries[idx].nameidx)]; -} - -/* - * Calculate total bytes used (including trailing pad for alignment) for - * a "local" name/value structure, a "remote" name/value structure, and - * a pointer which might be either. - */ -#define XFS_ATTR_LEAF_ENTSIZE_REMOTE(nlen) \ - xfs_attr_leaf_entsize_remote(nlen) -static inline int xfs_attr_leaf_entsize_remote(int nlen) -{ - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); -} - -#define XFS_ATTR_LEAF_ENTSIZE_LOCAL(nlen,vlen) \ - xfs_attr_leaf_entsize_local(nlen,vlen) -static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) -{ - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); -} - -#define XFS_ATTR_LEAF_ENTSIZE_LOCAL_MAX(bsize) \ - xfs_attr_leaf_entsize_local_max(bsize) -static inline int xfs_attr_leaf_entsize_local_max(int bsize) -{ - return (((bsize) >> 1) + ((bsize) >> 2)); -} - - -/*======================================================================== - * Structure used to pass context around among the routines. - *========================================================================*/ - -typedef struct xfs_attr_list_context { - struct xfs_inode *dp; /* inode */ - struct attrlist_cursor_kern *cursor;/* position in list */ - struct attrlist *alist; /* output buffer */ - int count; /* num used entries */ - int dupcnt; /* count dup hashvals seen */ - int bufsize;/* total buffer size */ - int firstu; /* first used byte in buffer */ - int flags; /* from VOP call */ - int resynch;/* T/F: resynch with cursor */ -} xfs_attr_list_context_t; - /* * Used to keep a list of "remote value" extents when unlinking an inode. */ @@ -231,53 +51,58 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); -int xfs_attr_shortform_allfit(struct xfs_dabuf *bp, struct xfs_inode *dp); +int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). */ -int xfs_attr_leaf_to_node(struct xfs_da_args *args); -int xfs_attr_leaf_to_shortform(struct xfs_dabuf *bp, +int xfs_attr3_leaf_to_node(struct xfs_da_args *args); +int xfs_attr3_leaf_to_shortform(struct xfs_buf *bp, struct xfs_da_args *args, int forkoff); -int xfs_attr_leaf_clearflag(struct xfs_da_args *args); -int xfs_attr_leaf_setflag(struct xfs_da_args *args); -int xfs_attr_leaf_flipflags(xfs_da_args_t *args); +int xfs_attr3_leaf_clearflag(struct xfs_da_args *args); +int xfs_attr3_leaf_setflag(struct xfs_da_args *args); +int xfs_attr3_leaf_flipflags(struct xfs_da_args *args); /* * Routines used for growing the Btree. */ -int xfs_attr_leaf_split(struct xfs_da_state *state, +int xfs_attr3_leaf_split(struct xfs_da_state *state, struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); -int xfs_attr_leaf_lookup_int(struct xfs_dabuf *leaf, +int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, struct xfs_da_args *args); -int xfs_attr_leaf_getvalue(struct xfs_dabuf *bp, struct xfs_da_args *args); -int xfs_attr_leaf_add(struct xfs_dabuf *leaf_buffer, +int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_remove(struct xfs_dabuf *leaf_buffer, +int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr_leaf_list_int(struct xfs_dabuf *bp, +int xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* * Routines used for shrinking the Btree. */ -int xfs_attr_leaf_toosmall(struct xfs_da_state *state, int *retval); -void xfs_attr_leaf_unbalance(struct xfs_da_state *state, +int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval); +void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, struct xfs_da_state_blk *drop_blk, struct xfs_da_state_blk *save_blk); -int xfs_attr_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); +int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp); /* * Utility routines. */ -xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_dabuf *bp, int *count); -int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); -int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, - int *local); -int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp); +xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count); +int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); +int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); +int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp); +void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from); +void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c new file mode 100644 index 00000000000..90e2eeb2120 --- /dev/null +++ b/fs/xfs/xfs_attr_list.c @@ -0,0 +1,653 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_sf.h" +#include "xfs_attr_remote.h" +#include "xfs_attr_leaf.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" +#include "xfs_dir2.h" + +STATIC int +xfs_attr_shortform_compare(const void *a, const void *b) +{ + xfs_attr_sf_sort_t *sa, *sb; + + sa = (xfs_attr_sf_sort_t *)a; + sb = (xfs_attr_sf_sort_t *)b; + if (sa->hash < sb->hash) { + return(-1); + } else if (sa->hash > sb->hash) { + return(1); + } else { + return(sa->entno - sb->entno); + } +} + +#define XFS_ISRESET_CURSOR(cursor) \ + (!((cursor)->initted) && !((cursor)->hashval) && \ + !((cursor)->blkno) && !((cursor)->offset)) +/* + * Copy out entries of shortform attribute lists for attr_list(). + * Shortform attribute lists are not stored in hashval sorted order. + * If the output buffer is not large enough to hold them all, then we + * we have to calculate each entries' hashvalue and sort them before + * we can begin returning them to the user. + */ +int +xfs_attr_shortform_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_sf_sort_t *sbuf, *sbp; + xfs_attr_shortform_t *sf; + xfs_attr_sf_entry_t *sfe; + xfs_inode_t *dp; + int sbsize, nsbuf, count, i; + int error; + + ASSERT(context != NULL); + dp = context->dp; + ASSERT(dp != NULL); + ASSERT(dp->i_afp != NULL); + sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; + ASSERT(sf != NULL); + if (!sf->hdr.count) + return(0); + cursor = context->cursor; + ASSERT(cursor != NULL); + + trace_xfs_attr_list_sf(context); + + /* + * If the buffer is large enough and the cursor is at the start, + * do not bother with sorting since we will return everything in + * one buffer and another call using the cursor won't need to be + * made. + * Note the generous fudge factor of 16 overhead bytes per entry. + * If bufsize is zero then put_listent must be a search function + * and can just scan through what we have. + */ + if (context->bufsize == 0 || + (XFS_ISRESET_CURSOR(cursor) && + (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + error = context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen, + &sfe->nameval[sfe->namelen]); + + /* + * Either search callback finished early or + * didn't fit it all in the buffer after all. + */ + if (context->seen_enough) + break; + + if (error) + return error; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + } + trace_xfs_attr_list_sf_all(context); + return(0); + } + + /* do no more for a search callback */ + if (context->bufsize == 0) + return 0; + + /* + * It didn't all fit, so we have to sort everything on hashval. + */ + sbsize = sf->hdr.count * sizeof(*sbuf); + sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS); + + /* + * Scan the attribute list for the rest of the entries, storing + * the relevant info from only those that match into a buffer. + */ + nsbuf = 0; + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || + ((char *)sfe >= ((char *)sf + dp->i_afp->if_bytes)))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe); + kmem_free(sbuf); + return XFS_ERROR(EFSCORRUPTED); + } + + sbp->entno = i; + sbp->hash = xfs_da_hashname(sfe->nameval, sfe->namelen); + sbp->name = sfe->nameval; + sbp->namelen = sfe->namelen; + /* These are bytes, and both on-disk, don't endian-flip */ + sbp->valuelen = sfe->valuelen; + sbp->flags = sfe->flags; + sfe = XFS_ATTR_SF_NEXTENTRY(sfe); + sbp++; + nsbuf++; + } + + /* + * Sort the entries on hash then entno. + */ + xfs_sort(sbuf, nsbuf, sizeof(*sbuf), xfs_attr_shortform_compare); + + /* + * Re-find our place IN THE SORTED LIST. + */ + count = 0; + cursor->initted = 1; + cursor->blkno = 0; + for (sbp = sbuf, i = 0; i < nsbuf; i++, sbp++) { + if (sbp->hash == cursor->hashval) { + if (cursor->offset == count) { + break; + } + count++; + } else if (sbp->hash > cursor->hashval) { + break; + } + } + if (i == nsbuf) { + kmem_free(sbuf); + return(0); + } + + /* + * Loop putting entries into the user buffer. + */ + for ( ; i < nsbuf; i++, sbp++) { + if (cursor->hashval != sbp->hash) { + cursor->hashval = sbp->hash; + cursor->offset = 0; + } + error = context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen, + &sbp->name[sbp->namelen]); + if (error) + return error; + if (context->seen_enough) + break; + cursor->offset++; + } + + kmem_free(sbuf); + return(0); +} + +STATIC int +xfs_attr_node_list(xfs_attr_list_context_t *context) +{ + attrlist_cursor_kern_t *cursor; + xfs_attr_leafblock_t *leaf; + xfs_da_intnode_t *node; + struct xfs_attr3_icleaf_hdr leafhdr; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int error, i; + struct xfs_buf *bp; + struct xfs_inode *dp = context->dp; + + trace_xfs_attr_node_list(context); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Do all sorts of validation on the passed-in cursor structure. + * If anything is amiss, ignore the cursor and look up the hashval + * starting from the btree root. + */ + bp = NULL; + if (cursor->blkno > 0) { + error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, + &bp, XFS_ATTR_FORK); + if ((error != 0) && (error != EFSCORRUPTED)) + return(error); + if (bp) { + struct xfs_attr_leaf_entry *entries; + + node = bp->b_addr; + switch (be16_to_cpu(node->hdr.info.magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + if (cursor->hashval > be32_to_cpu( + entries[leafhdr.count - 1].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } else if (cursor->hashval <= be32_to_cpu( + entries[0].hashval)) { + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + break; + default: + trace_xfs_attr_list_wrong_blk(context); + xfs_trans_brelse(NULL, bp); + bp = NULL; + } + } + } + + /* + * We did not find what we expected given the cursor's contents, + * so we start from the top and work down based on the hash value. + * Note that start of node block is same as start of leaf block. + */ + if (bp == NULL) { + cursor->blkno = 0; + for (;;) { + __uint16_t magic; + + error = xfs_da3_node_read(NULL, dp, + cursor->blkno, -1, &bp, + XFS_ATTR_FORK); + if (error) + return(error); + node = bp->b_addr; + magic = be16_to_cpu(node->hdr.info.magic); + if (magic == XFS_ATTR_LEAF_MAGIC || + magic == XFS_ATTR3_LEAF_MAGIC) + break; + if (magic != XFS_DA_NODE_MAGIC && + magic != XFS_DA3_NODE_MAGIC) { + XFS_CORRUPTION_ERROR("xfs_attr_node_list(3)", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, + node); + xfs_trans_brelse(NULL, bp); + return XFS_ERROR(EFSCORRUPTED); + } + + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + for (i = 0; i < nodehdr.count; btree++, i++) { + if (cursor->hashval + <= be32_to_cpu(btree->hashval)) { + cursor->blkno = be32_to_cpu(btree->before); + trace_xfs_attr_list_node_descend(context, + btree); + break; + } + } + if (i == nodehdr.count) { + xfs_trans_brelse(NULL, bp); + return 0; + } + xfs_trans_brelse(NULL, bp); + } + } + ASSERT(bp != NULL); + + /* + * Roll upward through the blocks, processing each leaf block in + * order. As long as there is space in the result buffer, keep + * adding the information. + */ + for (;;) { + leaf = bp->b_addr; + error = xfs_attr3_leaf_list_int(bp, context); + if (error) { + xfs_trans_brelse(NULL, bp); + return error; + } + xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + if (context->seen_enough || leafhdr.forw == 0) + break; + cursor->blkno = leafhdr.forw; + xfs_trans_brelse(NULL, bp); + error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp); + if (error) + return error; + } + xfs_trans_brelse(NULL, bp); + return 0; +} + +/* + * Copy out attribute list entries for attr_list(), for leaf attribute lists. + */ +int +xfs_attr3_leaf_list_int( + struct xfs_buf *bp, + struct xfs_attr_list_context *context) +{ + struct attrlist_cursor_kern *cursor; + struct xfs_attr_leafblock *leaf; + struct xfs_attr3_icleaf_hdr ichdr; + struct xfs_attr_leaf_entry *entries; + struct xfs_attr_leaf_entry *entry; + int retval; + int i; + + trace_xfs_attr_list_leaf(context); + + leaf = bp->b_addr; + xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + entries = xfs_attr3_leaf_entryp(leaf); + + cursor = context->cursor; + cursor->initted = 1; + + /* + * Re-find our place in the leaf block if this is a new syscall. + */ + if (context->resynch) { + entry = &entries[0]; + for (i = 0; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) == cursor->hashval) { + if (cursor->offset == context->dupcnt) { + context->dupcnt = 0; + break; + } + context->dupcnt++; + } else if (be32_to_cpu(entry->hashval) > + cursor->hashval) { + context->dupcnt = 0; + break; + } + } + if (i == ichdr.count) { + trace_xfs_attr_list_notfound(context); + return 0; + } + } else { + entry = &entries[0]; + i = 0; + } + context->resynch = 0; + + /* + * We have found our place, start copying out the new attributes. + */ + retval = 0; + for (; i < ichdr.count; entry++, i++) { + if (be32_to_cpu(entry->hashval) != cursor->hashval) { + cursor->hashval = be32_to_cpu(entry->hashval); + cursor->offset = 0; + } + + if (entry->flags & XFS_ATTR_INCOMPLETE) + continue; /* skip incomplete entries */ + + if (entry->flags & XFS_ATTR_LOCAL) { + xfs_attr_leaf_name_local_t *name_loc = + xfs_attr3_leaf_name_local(leaf, i); + + retval = context->put_listent(context, + entry->flags, + name_loc->nameval, + (int)name_loc->namelen, + be16_to_cpu(name_loc->valuelen), + &name_loc->nameval[name_loc->namelen]); + if (retval) + return retval; + } else { + xfs_attr_leaf_name_remote_t *name_rmt = + xfs_attr3_leaf_name_remote(leaf, i); + + int valuelen = be32_to_cpu(name_rmt->valuelen); + + if (context->put_value) { + xfs_da_args_t args; + + memset((char *)&args, 0, sizeof(args)); + args.geo = context->dp->i_mount->m_attr_geo; + args.dp = context->dp; + args.whichfork = XFS_ATTR_FORK; + args.valuelen = valuelen; + args.rmtvaluelen = valuelen; + args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); + args.rmtblkno = be32_to_cpu(name_rmt->valueblk); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); + retval = xfs_attr_rmtval_get(&args); + if (retval) + return retval; + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + args.value); + kmem_free(args.value); + } else { + retval = context->put_listent(context, + entry->flags, + name_rmt->name, + (int)name_rmt->namelen, + valuelen, + NULL); + } + if (retval) + return retval; + } + if (context->seen_enough) + break; + cursor->offset++; + } + trace_xfs_attr_list_leaf_end(context); + return retval; +} + +/* + * Copy out attribute entries for attr_list(), for leaf attribute lists. + */ +STATIC int +xfs_attr_leaf_list(xfs_attr_list_context_t *context) +{ + int error; + struct xfs_buf *bp; + + trace_xfs_attr_leaf_list(context); + + context->cursor->blkno = 0; + error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); + if (error) + return XFS_ERROR(error); + + error = xfs_attr3_leaf_list_int(bp, context); + xfs_trans_brelse(NULL, bp); + return XFS_ERROR(error); +} + +int +xfs_attr_list_int( + xfs_attr_list_context_t *context) +{ + int error; + xfs_inode_t *dp = context->dp; + uint lock_mode; + + XFS_STATS_INC(xs_attr_list); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return EIO; + + /* + * Decide on what work routines to call based on the inode size. + */ + lock_mode = xfs_ilock_attr_map_shared(dp); + if (!xfs_inode_hasattr(dp)) { + error = 0; + } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { + error = xfs_attr_shortform_list(context); + } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) { + error = xfs_attr_leaf_list(context); + } else { + error = xfs_attr_node_list(context); + } + xfs_iunlock(dp, lock_mode); + return error; +} + +#define ATTR_ENTBASESIZE /* minimum bytes used by an attr */ \ + (((struct attrlist_ent *) 0)->a_name - (char *) 0) +#define ATTR_ENTSIZE(namelen) /* actual bytes used by an attr */ \ + ((ATTR_ENTBASESIZE + (namelen) + 1 + sizeof(u_int32_t)-1) \ + & ~(sizeof(u_int32_t)-1)) + +/* + * Format an attribute and copy it out to the user's buffer. + * Take care to check values and protect against them changing later, + * we may be reading them directly out of a user buffer. + */ +STATIC int +xfs_attr_put_listent( + xfs_attr_list_context_t *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + struct attrlist *alist = (struct attrlist *)context->alist; + attrlist_ent_t *aep; + int arraytop; + + ASSERT(!(context->flags & ATTR_KERNOVAL)); + ASSERT(context->count >= 0); + ASSERT(context->count < (ATTR_MAX_VALUELEN/8)); + ASSERT(context->firstu >= sizeof(*alist)); + ASSERT(context->firstu <= context->bufsize); + + /* + * Only list entries in the right namespace. + */ + if (((context->flags & ATTR_SECURE) == 0) != + ((flags & XFS_ATTR_SECURE) == 0)) + return 0; + if (((context->flags & ATTR_ROOT) == 0) != + ((flags & XFS_ATTR_ROOT) == 0)) + return 0; + + arraytop = sizeof(*alist) + + context->count * sizeof(alist->al_offset[0]); + context->firstu -= ATTR_ENTSIZE(namelen); + if (context->firstu < arraytop) { + trace_xfs_attr_list_full(context); + alist->al_more = 1; + context->seen_enough = 1; + return 1; + } + + aep = (attrlist_ent_t *)&context->alist[context->firstu]; + aep->a_valuelen = valuelen; + memcpy(aep->a_name, name, namelen); + aep->a_name[namelen] = 0; + alist->al_offset[context->count++] = context->firstu; + alist->al_count = context->count; + trace_xfs_attr_list_add(context); + return 0; +} + +/* + * Generate a list of extended attribute names and optionally + * also value lengths. Positive return value follows the XFS + * convention of being an error, zero or negative return code + * is the length of the buffer returned (negated), indicating + * success. + */ +int +xfs_attr_list( + xfs_inode_t *dp, + char *buffer, + int bufsize, + int flags, + attrlist_cursor_kern_t *cursor) +{ + xfs_attr_list_context_t context; + struct attrlist *alist; + int error; + + /* + * Validate the cursor. + */ + if (cursor->pad1 || cursor->pad2) + return(XFS_ERROR(EINVAL)); + if ((cursor->initted == 0) && + (cursor->hashval || cursor->blkno || cursor->offset)) + return XFS_ERROR(EINVAL); + + /* + * Check for a properly aligned buffer. + */ + if (((long)buffer) & (sizeof(int)-1)) + return XFS_ERROR(EFAULT); + if (flags & ATTR_KERNOVAL) + bufsize = 0; + + /* + * Initialize the output buffer. + */ + memset(&context, 0, sizeof(context)); + context.dp = dp; + context.cursor = cursor; + context.resynch = 1; + context.flags = flags; + context.alist = buffer; + context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */ + context.firstu = context.bufsize; + context.put_listent = xfs_attr_put_listent; + + alist = (struct attrlist *)context.alist; + alist->al_count = 0; + alist->al_more = 0; + alist->al_offset[0] = context.bufsize; + + error = xfs_attr_list_int(&context); + ASSERT(error >= 0); + return error; +} diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c new file mode 100644 index 00000000000..b5adfecbb8e --- /dev/null +++ b/fs/xfs/xfs_attr_remote.c @@ -0,0 +1,628 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_attr_remote.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" +#include "xfs_error.h" + +#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */ + +/* + * Each contiguous block has a header, so it is not just a simple attribute + * length to FSB conversion. + */ +int +xfs_attr3_rmt_blocks( + struct xfs_mount *mp, + int attrlen) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_attr3_rmt_verify( + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (rmt->rm_magic != cpu_to_be32(XFS_ATTR3_RMT_MAGIC)) + return false; + if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) + return false; + if (be32_to_cpu(rmt->rm_offset) + + be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) + return false; + if (rmt->rm_owner == 0) + return false; + + return true; +} + +static void +xfs_attr3_rmt_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + + if (bp->b_error) + xfs_verifier_error(bp); + else + ASSERT(len == 0); +} + +static void +xfs_attr3_rmt_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; + int blksize = mp->m_attr_geo->blksize; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= blksize); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); + + len -= blksize; + ptr += blksize; + bno += BTOBB(blksize); + } + ASSERT(len == 0); +} + +const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .verify_read = xfs_attr3_rmt_read_verify, + .verify_write = xfs_attr3_rmt_write_verify, +}; + +STATIC int +xfs_attr3_rmt_hdr_set( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); + rmt->rm_offset = cpu_to_be32(offset); + rmt->rm_bytes = cpu_to_be32(size); + uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); + rmt->rm_owner = cpu_to_be64(ino); + rmt->rm_blkno = cpu_to_be64(bno); + + return sizeof(struct xfs_attr3_rmt_hdr); +} + +/* + * Helper functions to copy attribute data in and out of the one disk extents + */ +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **dst) +{ + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= blksize; + src += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + __uint8_t **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + int blksize = mp->m_attr_geo->blksize; + + ASSERT(len >= blksize); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < blksize) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == blksize); + memset(dst + hdr_size + byte_cnt, 0, + blksize - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= blksize; + dst += blksize; + bno += BTOBB(blksize); + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } +} + +/* + * Read the value associated with an attribute from the out-of-line buffer + * that we stored it in. + */ +int +xfs_attr_rmtval_get( + struct xfs_da_args *args) +{ + struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; + struct xfs_mount *mp = args->dp->i_mount; + struct xfs_buf *bp; + xfs_dablk_t lblkno = args->rmtblkno; + __uint8_t *dst = args->value; + int valuelen; + int nmap; + int error; + int blkcnt = args->rmtblkcnt; + int i; + int offset = 0; + + trace_xfs_attr_rmtval_get(args); + + ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + nmap = ATTR_RMTVALUE_MAPSIZE; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return error; + ASSERT(nmap >= 1); + + for (i = 0; (i < nmap) && (valuelen > 0); i++) { + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && + (map[i].br_startblock != HOLESTARTBLOCK)); + dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + dblkno, dblkcnt, 0, &bp, + &xfs_attr3_rmt_buf_ops); + if (error) + return error; + + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); + xfs_buf_relse(bp); + if (error) + return error; + + /* roll attribute extent map forwards */ + lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; + } + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Write the value associated with an attribute into the out-of-line buffer + * that we have defined for it. + */ +int +xfs_attr_rmtval_set( + struct xfs_da_args *args) +{ + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_bmbt_irec map; + xfs_dablk_t lblkno; + xfs_fileoff_t lfileoff = 0; + __uint8_t *src = args->value; + int blkcnt; + int valuelen; + int nmap; + int error; + int offset = 0; + + trace_xfs_attr_rmtval_set(args); + + /* + * Find a "hole" in the attribute address space large enough for + * us to drop the new attribute's value into. Because CRC enable + * attributes have headers, we can't just do a straight byte to FSB + * conversion and have to take the header space into account. + */ + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); + error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, + XFS_ATTR_FORK); + if (error) + return error; + + args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; + args->rmtblkcnt = blkcnt; + + /* + * Roll through the "value", allocating blocks on disk as required. + */ + while (blkcnt > 0) { + int committed; + + /* + * Allocate a single extent, up to the size of the value. + */ + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, + blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + args->firstblock, args->total, &map, &nmap, + args->flist); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return(error); + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, dp, 0); + + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + + /* + * Start the next trans in the chain. + */ + error = xfs_trans_roll(&args->trans, dp); + if (error) + return (error); + } + + /* + * Roll through the "value", copying the attribute value to the + * already-allocated blocks. Blocks are written synchronously + * so that we can know they are all on disk before we turn off + * the INCOMPLETE flag. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + valuelen = args->rmtvaluelen; + while (valuelen > 0) { + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); + + xfs_bmap_init(args->flist, args->firstblock); + nmap = 1; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, + XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); + if (!bp) + return ENOMEM; + bp->b_ops = &xfs_attr3_rmt_buf_ops; + + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); + + error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ + xfs_buf_relse(bp); + if (error) + return error; + + + /* roll attribute extent map forwards */ + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + ASSERT(valuelen == 0); + return 0; +} + +/* + * Remove the value associated with an attribute by deleting the + * out-of-line buffer that it is stored on. + */ +int +xfs_attr_rmtval_remove( + struct xfs_da_args *args) +{ + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; + + trace_xfs_attr_rmtval_remove(args); + + /* + * Roll through the "value", invalidating the attribute value's blocks. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + + /* + * Try to remember where we decided to put the value. + */ + nmap = 1; + error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); + if (error) + return(error); + ASSERT(nmap == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + + /* + * If the "remote" value is in the cache, remove it. + */ + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + if (bp) { + xfs_buf_stale(bp); + xfs_buf_relse(bp); + bp = NULL; + } + + lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; + } + + /* + * Keep de-allocating extents until the remote-value region is gone. + */ + lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; + done = 0; + while (!done) { + int committed; + + xfs_bmap_init(args->flist, args->firstblock); + error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, + XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, + 1, args->firstblock, args->flist, + &done); + if (!error) { + error = xfs_bmap_finish(&args->trans, args->flist, + &committed); + } + if (error) { + ASSERT(committed); + args->trans = NULL; + xfs_bmap_cancel(args->flist); + return error; + } + + /* + * bmap_finish() may have committed the last trans and started + * a new one. We need the inode to be in all transactions. + */ + if (committed) + xfs_trans_ijoin(args->trans, args->dp, 0); + + /* + * Close out trans and start the next one in the chain. + */ + error = xfs_trans_roll(&args->trans, args->dp); + if (error) + return (error); + } + return(0); +} diff --git a/fs/xfs/linux-2.6/mutex.h b/fs/xfs/xfs_attr_remote.h index 2a88d56c4dc..5a9acfa156d 100644 --- a/fs/xfs/linux-2.6/mutex.h +++ b/fs/xfs/xfs_attr_remote.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -15,11 +15,13 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef __XFS_SUPPORT_MUTEX_H__ -#define __XFS_SUPPORT_MUTEX_H__ +#ifndef __XFS_ATTR_REMOTE_H__ +#define __XFS_ATTR_REMOTE_H__ -#include <linux/mutex.h> +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); -typedef struct mutex mutex_t; +int xfs_attr_rmtval_get(struct xfs_da_args *args); +int xfs_attr_rmtval_set(struct xfs_da_args *args); +int xfs_attr_rmtval_remove(struct xfs_da_args *args); -#endif /* __XFS_SUPPORT_MUTEX_H__ */ +#endif /* __XFS_ATTR_REMOTE_H__ */ diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h index f67f917803b..919756e3ba5 100644 --- a/fs/xfs/xfs_attr_sf.h +++ b/fs/xfs/xfs_attr_sf.h @@ -25,8 +25,6 @@ * to fit into the literal area of the inode. */ -struct xfs_inode; - /* * Entries are packed toward the top as tight as possible. */ @@ -54,7 +52,7 @@ typedef struct xfs_attr_sf_sort { __uint8_t valuelen; /* length of value */ __uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ xfs_dahash_t hash; /* this entry's hash value */ - char *name; /* name value, pointer into buffer */ + unsigned char *name; /* name value, pointer into buffer */ } xfs_attr_sf_sort_t; #define XFS_ATTR_SF_ENTSIZE_BYNAME(nlen,vlen) /* space name/value uses */ \ @@ -69,46 +67,4 @@ typedef struct xfs_attr_sf_sort { (be16_to_cpu(((xfs_attr_shortform_t *) \ ((dp)->i_afp->if_u1.if_data))->hdr.totsize)) -#if defined(XFS_ATTR_TRACE) -/* - * Kernel tracing support for attribute lists - */ -struct xfs_attr_list_context; -struct xfs_da_intnode; -struct xfs_da_node_entry; -struct xfs_attr_leafblock; - -#define XFS_ATTR_TRACE_SIZE 4096 /* size of global trace buffer */ -extern ktrace_t *xfs_attr_trace_buf; - -/* - * Trace record types. - */ -#define XFS_ATTR_KTRACE_L_C 1 /* context */ -#define XFS_ATTR_KTRACE_L_CN 2 /* context, node */ -#define XFS_ATTR_KTRACE_L_CB 3 /* context, btree */ -#define XFS_ATTR_KTRACE_L_CL 4 /* context, leaf */ - -void xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context); -void xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context, - struct xfs_da_intnode *node); -void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context, - struct xfs_da_node_entry *btree); -void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context, - struct xfs_attr_leafblock *leaf); -void xfs_attr_trace_enter(int type, char *where, - __psunsigned_t a2, __psunsigned_t a3, - __psunsigned_t a4, __psunsigned_t a5, - __psunsigned_t a6, __psunsigned_t a7, - __psunsigned_t a8, __psunsigned_t a9, - __psunsigned_t a10, __psunsigned_t a11, - __psunsigned_t a12, __psunsigned_t a13, - __psunsigned_t a14, __psunsigned_t a15); -#else -#define xfs_attr_trace_l_c(w,c) -#define xfs_attr_trace_l_cn(w,c,n) -#define xfs_attr_trace_l_cb(w,c,b) -#define xfs_attr_trace_l_cl(w,c,l) -#endif /* XFS_ATTR_TRACE */ - #endif /* __XFS_ATTR_SF_H__ */ diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c deleted file mode 100644 index f4fe3715a80..00000000000 --- a/fs/xfs/xfs_behavior.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" - -/* - * Source file used to associate/disassociate behaviors with virtualized - * objects. See xfs_behavior.h for more information about behaviors, etc. - * - * The implementation is split between functions in this file and macros - * in xfs_behavior.h. - */ - -/* - * Insert a new behavior descriptor into a behavior chain. - * - * The behavior chain is ordered based on the 'position' number which - * lives in the first field of the ops vector (higher numbers first). - * - * Attempts to insert duplicate ops result in an EINVAL return code. - * Otherwise, return 0 to indicate success. - */ -int -bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp) -{ - bhv_desc_t *curdesc, *prev; - int position; - - /* - * Validate the position value of the new behavior. - */ - position = BHV_POSITION(bdp); - ASSERT(position >= BHV_POSITION_BASE && position <= BHV_POSITION_TOP); - - /* - * Find location to insert behavior. Check for duplicates. - */ - prev = NULL; - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - /* Check for duplication. */ - if (curdesc->bd_ops == bdp->bd_ops) { - ASSERT(0); - return EINVAL; - } - - /* Find correct position */ - if (position >= BHV_POSITION(curdesc)) { - ASSERT(position != BHV_POSITION(curdesc)); - break; /* found it */ - } - - prev = curdesc; - } - - if (prev == NULL) { - /* insert at front of chain */ - bdp->bd_next = bhp->bh_first; - bhp->bh_first = bdp; - } else { - /* insert after prev */ - bdp->bd_next = prev->bd_next; - prev->bd_next = bdp; - } - - return 0; -} - -/* - * Remove a behavior descriptor from a position in a behavior chain; - * the position is guaranteed not to be the first position. - * Should only be called by the bhv_remove() macro. - */ -void -bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp) -{ - bhv_desc_t *curdesc, *prev; - - ASSERT(bhp->bh_first != NULL); - ASSERT(bhp->bh_first->bd_next != NULL); - - prev = bhp->bh_first; - for (curdesc = bhp->bh_first->bd_next; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc == bdp) - break; /* found it */ - prev = curdesc; - } - - ASSERT(curdesc == bdp); - prev->bd_next = bdp->bd_next; /* remove from after prev */ -} - -/* - * Look for a specific ops vector on the specified behavior chain. - * Return the associated behavior descriptor. Or NULL, if not found. - */ -bhv_desc_t * -bhv_lookup(bhv_head_t *bhp, void *ops) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_ops == ops) - return curdesc; - } - - return NULL; -} - -/* - * Looks for the first behavior within a specified range of positions. - * Return the associated behavior descriptor. Or NULL, if none found. - */ -bhv_desc_t * -bhv_lookup_range(bhv_head_t *bhp, int low, int high) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - int position = BHV_POSITION(curdesc); - - if (position <= high) { - if (position >= low) - return curdesc; - return NULL; - } - } - - return NULL; -} - -/* - * Return the base behavior in the chain, or NULL if the chain - * is empty. - * - * The caller has not read locked the behavior chain, so acquire the - * lock before traversing the chain. - */ -bhv_desc_t * -bhv_base(bhv_head_t *bhp) -{ - bhv_desc_t *curdesc; - - for (curdesc = bhp->bh_first; - curdesc != NULL; - curdesc = curdesc->bd_next) { - - if (curdesc->bd_next == NULL) { - return curdesc; - } - } - - return NULL; -} - -void -bhv_head_init( - bhv_head_t *bhp, - char *name) -{ - bhp->bh_first = NULL; -} - -void -bhv_insert_initial( - bhv_head_t *bhp, - bhv_desc_t *bdp) -{ - ASSERT(bhp->bh_first == NULL); - (bhp)->bh_first = bdp; -} - -void -bhv_head_destroy( - bhv_head_t *bhp) -{ - ASSERT(bhp->bh_first == NULL); -} diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h deleted file mode 100644 index 6e6e56fb352..00000000000 --- a/fs/xfs/xfs_behavior.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_BEHAVIOR_H__ -#define __XFS_BEHAVIOR_H__ - -/* - * Header file used to associate behaviors with virtualized objects. - * - * A virtualized object is an internal, virtualized representation of - * OS entities such as persistent files, processes, or sockets. Examples - * of virtualized objects include vnodes, vprocs, and vsockets. Often - * a virtualized object is referred to simply as an "object." - * - * A behavior is essentially an implementation layer associated with - * an object. Multiple behaviors for an object are chained together, - * the order of chaining determining the order of invocation. Each - * behavior of a given object implements the same set of interfaces - * (e.g., the VOP interfaces). - * - * Behaviors may be dynamically inserted into an object's behavior chain, - * such that the addition is transparent to consumers that already have - * references to the object. Typically, a given behavior will be inserted - * at a particular location in the behavior chain. Insertion of new - * behaviors is synchronized with operations-in-progress (oip's) so that - * the oip's always see a consistent view of the chain. - * - * The term "interposition" is used to refer to the act of inserting - * a behavior such that it interposes on (i.e., is inserted in front - * of) a particular other behavior. A key example of this is when a - * system implementing distributed single system image wishes to - * interpose a distribution layer (providing distributed coherency) - * in front of an object that is otherwise only accessed locally. - * - * Note that the traditional vnode/inode combination is simply a virtualized - * object that has exactly one associated behavior. - * - * Behavior synchronization is logic which is necessary under certain - * circumstances that there is no conflict between ongoing operations - * traversing the behavior chain and those dynamically modifying the - * behavior chain. Because behavior synchronization adds extra overhead - * to virtual operation invocation, we want to restrict, as much as - * we can, the requirement for this extra code, to those situations - * in which it is truly necessary. - * - * Behavior synchronization is needed whenever there's at least one class - * of object in the system for which: - * 1) multiple behaviors for a given object are supported, - * -- AND -- - * 2a) insertion of a new behavior can happen dynamically at any time during - * the life of an active object, - * -- AND -- - * 3a) insertion of a new behavior needs to synchronize with existing - * ops-in-progress. - * -- OR -- - * 3b) multiple different behaviors can be dynamically inserted at - * any time during the life of an active object - * -- OR -- - * 3c) removal of a behavior can occur at any time during the life of - * an active object. - * -- OR -- - * 2b) removal of a behavior can occur at any time during the life of an - * active object - * - */ - -/* - * Behavior head. Head of the chain of behaviors. - * Contained within each virtualized object data structure. - */ -typedef struct bhv_head { - struct bhv_desc *bh_first; /* first behavior in chain */ -} bhv_head_t; - -/* - * Behavior descriptor. Descriptor associated with each behavior. - * Contained within the behavior's private data structure. - */ -typedef struct bhv_desc { - void *bd_pdata; /* private data for this behavior */ - void *bd_vobj; /* virtual object associated with */ - void *bd_ops; /* ops for this behavior */ - struct bhv_desc *bd_next; /* next behavior in chain */ -} bhv_desc_t; - -/* - * Behavior identity field. A behavior's identity determines the position - * where it lives within a behavior chain, and it's always the first field - * of the behavior's ops vector. The optional id field further identifies the - * subsystem responsible for the behavior. - */ -typedef struct bhv_identity { - __u16 bi_id; /* owning subsystem id */ - __u16 bi_position; /* position in chain */ -} bhv_identity_t; - -typedef bhv_identity_t bhv_position_t; - -#define BHV_IDENTITY_INIT(id,pos) {id, pos} -#define BHV_IDENTITY_INIT_POSITION(pos) BHV_IDENTITY_INIT(0, pos) - -/* - * Define boundaries of position values. - */ -#define BHV_POSITION_INVALID 0 /* invalid position number */ -#define BHV_POSITION_BASE 1 /* base (last) implementation layer */ -#define BHV_POSITION_TOP 63 /* top (first) implementation layer */ - -/* - * Plumbing macros. - */ -#define BHV_HEAD_FIRST(bhp) (ASSERT((bhp)->bh_first), (bhp)->bh_first) -#define BHV_NEXT(bdp) (ASSERT((bdp)->bd_next), (bdp)->bd_next) -#define BHV_NEXTNULL(bdp) ((bdp)->bd_next) -#define BHV_VOBJ(bdp) (ASSERT((bdp)->bd_vobj), (bdp)->bd_vobj) -#define BHV_VOBJNULL(bdp) ((bdp)->bd_vobj) -#define BHV_PDATA(bdp) (bdp)->bd_pdata -#define BHV_OPS(bdp) (bdp)->bd_ops -#define BHV_IDENTITY(bdp) ((bhv_identity_t *)(bdp)->bd_ops) -#define BHV_POSITION(bdp) (BHV_IDENTITY(bdp)->bi_position) - -extern void bhv_head_init(bhv_head_t *, char *); -extern void bhv_head_destroy(bhv_head_t *); -extern int bhv_insert(bhv_head_t *, bhv_desc_t *); -extern void bhv_insert_initial(bhv_head_t *, bhv_desc_t *); - -/* - * Initialize a new behavior descriptor. - * Arguments: - * bdp - pointer to behavior descriptor - * pdata - pointer to behavior's private data - * vobj - pointer to associated virtual object - * ops - pointer to ops for this behavior - */ -#define bhv_desc_init(bdp, pdata, vobj, ops) \ - { \ - (bdp)->bd_pdata = pdata; \ - (bdp)->bd_vobj = vobj; \ - (bdp)->bd_ops = ops; \ - (bdp)->bd_next = NULL; \ - } - -/* - * Remove a behavior descriptor from a behavior chain. - */ -#define bhv_remove(bhp, bdp) \ - { \ - if ((bhp)->bh_first == (bdp)) { \ - /* \ - * Remove from front of chain. \ - * Atomic wrt oip's. \ - */ \ - (bhp)->bh_first = (bdp)->bd_next; \ - } else { \ - /* remove from non-front of chain */ \ - bhv_remove_not_first(bhp, bdp); \ - } \ - (bdp)->bd_vobj = NULL; \ - } - -/* - * Behavior module prototypes. - */ -extern void bhv_remove_not_first(bhv_head_t *bhp, bhv_desc_t *bdp); -extern bhv_desc_t * bhv_lookup(bhv_head_t *bhp, void *ops); -extern bhv_desc_t * bhv_lookup_range(bhv_head_t *bhp, int low, int high); -extern bhv_desc_t * bhv_base(bhv_head_t *bhp); - -/* No bhv locking on Linux */ -#define bhv_lookup_unlocked bhv_lookup -#define bhv_base_unlocked bhv_base - -#endif /* __XFS_BEHAVIOR_H__ */ diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c index 43be6a7e47c..0e8885a5964 100644 --- a/fs/xfs/xfs_bit.c +++ b/fs/xfs/xfs_bit.c @@ -16,207 +16,29 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_log_format.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" /* * XFS bit manipulation routines, used in non-realtime code. */ -#ifndef HAVE_ARCH_HIGHBIT /* - * Index of high bit number in byte, -1 for none set, 0..7 otherwise. - */ -STATIC const char xfs_highbit[256] = { - -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */ - 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */ - 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */ - 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */ - 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */ - 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */ - 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */ - 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */ - 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */ - 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */ - 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */ - 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */ - 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */ -}; -#endif - -/* - * Count of bits set in byte, 0..8. - */ -static const char xfs_countbit[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, /* 00 .. 07 */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 08 .. 0f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 10 .. 17 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 18 .. 1f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 20 .. 27 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 28 .. 2f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 30 .. 37 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 38 .. 3f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 40 .. 47 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 48 .. 4f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 50 .. 57 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 58 .. 5f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 60 .. 67 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 68 .. 6f */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 70 .. 77 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* 78 .. 7f */ - 1, 2, 2, 3, 2, 3, 3, 4, /* 80 .. 87 */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 88 .. 8f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* 90 .. 97 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* 98 .. 9f */ - 2, 3, 3, 4, 3, 4, 4, 5, /* a0 .. a7 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* a8 .. af */ - 3, 4, 4, 5, 4, 5, 5, 6, /* b0 .. b7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* b8 .. bf */ - 2, 3, 3, 4, 3, 4, 4, 5, /* c0 .. c7 */ - 3, 4, 4, 5, 4, 5, 5, 6, /* c8 .. cf */ - 3, 4, 4, 5, 4, 5, 5, 6, /* d0 .. d7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* d8 .. df */ - 3, 4, 4, 5, 4, 5, 5, 6, /* e0 .. e7 */ - 4, 5, 5, 6, 5, 6, 6, 7, /* e8 .. ef */ - 4, 5, 5, 6, 5, 6, 6, 7, /* f0 .. f7 */ - 5, 6, 6, 7, 6, 7, 7, 8, /* f8 .. ff */ -}; - -/* - * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set. - */ -inline int -xfs_highbit32( - __uint32_t v) -{ -#ifdef HAVE_ARCH_HIGHBIT - return highbit32(v); -#else - int i; - - if (v & 0xffff0000) - if (v & 0xff000000) - i = 24; - else - i = 16; - else if (v & 0x0000ffff) - if (v & 0x0000ff00) - i = 8; - else - i = 0; - else - return -1; - return i + xfs_highbit[(v >> i) & 0xff]; -#endif -} - -/* - * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set. - */ -int -xfs_lowbit64( - __uint64_t v) -{ - __uint32_t w = (__uint32_t)v; - int n = 0; - - if (w) { /* lower bits */ - n = ffs(w); - } else { /* upper bits */ - w = (__uint32_t)(v >> 32); - if (w && (n = ffs(w))) - n += 32; - } - return n - 1; -} - -/* - * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set. - */ -int -xfs_highbit64( - __uint64_t v) -{ - __uint32_t h = (__uint32_t)(v >> 32); - - if (h) - return xfs_highbit32(h) + 32; - return xfs_highbit32((__uint32_t)v); -} - - -/* - * Count the number of bits set in the bitmap starting with bit - * start_bit. Size is the size of the bitmap in words. - * - * Do the counting by mapping a byte value to the number of set - * bits for that value using the xfs_countbit array, i.e. - * xfs_countbit[0] == 0, xfs_countbit[1] == 1, xfs_countbit[2] == 1, - * xfs_countbit[3] == 2, etc. + * Return whether bitmap is empty. + * Size is number of words in the bitmap, which is padded to word boundary + * Returns 1 for empty, 0 for non-empty. */ int -xfs_count_bits(uint *map, uint size, uint start_bit) +xfs_bitmap_empty(uint *map, uint size) { - register int bits; - register unsigned char *bytep; - register unsigned char *end_map; - int byte_bit; - - bits = 0; - end_map = (char*)(map + size); - bytep = (char*)(map + (start_bit & ~0x7)); - byte_bit = start_bit & 0x7; - - /* - * If the caller fell off the end of the map, return 0. - */ - if (bytep >= end_map) { - return (0); - } - - /* - * If start_bit is not byte aligned, then process the - * first byte separately. - */ - if (byte_bit != 0) { - /* - * Shift off the bits we don't want to look at, - * before indexing into xfs_countbit. - */ - bits += xfs_countbit[(*bytep >> byte_bit)]; - bytep++; - } + uint i; + uint ret = 0; - /* - * Count the bits in each byte until the end of the bitmap. - */ - while (bytep < end_map) { - bits += xfs_countbit[*bytep]; - bytep++; + for (i = 0; i < size; i++) { + ret |= map[i]; } - return (bits); + return (ret == 0); } /* diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h index 0bbe5681754..e1649c0d3e0 100644 --- a/fs/xfs/xfs_bit.h +++ b/fs/xfs/xfs_bit.h @@ -23,40 +23,60 @@ */ /* - * masks with n high/low bits set, 32-bit values & 64-bit values + * masks with n high/low bits set, 64-bit values */ -#define XFS_MASK32HI(n) xfs_mask32hi(n) -static inline __uint32_t xfs_mask32hi(int n) -{ - return (__uint32_t)-1 << (32 - (n)); -} -#define XFS_MASK64HI(n) xfs_mask64hi(n) static inline __uint64_t xfs_mask64hi(int n) { return (__uint64_t)-1 << (64 - (n)); } -#define XFS_MASK32LO(n) xfs_mask32lo(n) static inline __uint32_t xfs_mask32lo(int n) { return ((__uint32_t)1 << (n)) - 1; } -#define XFS_MASK64LO(n) xfs_mask64lo(n) static inline __uint64_t xfs_mask64lo(int n) { return ((__uint64_t)1 << (n)) - 1; } /* Get high bit set out of 32-bit argument, -1 if none set */ -extern int xfs_highbit32(__uint32_t v); +static inline int xfs_highbit32(__uint32_t v) +{ + return fls(v) - 1; +} + +/* Get high bit set out of 64-bit argument, -1 if none set */ +static inline int xfs_highbit64(__uint64_t v) +{ + return fls64(v) - 1; +} + +/* Get low bit set out of 32-bit argument, -1 if none set */ +static inline int xfs_lowbit32(__uint32_t v) +{ + return ffs(v) - 1; +} /* Get low bit set out of 64-bit argument, -1 if none set */ -extern int xfs_lowbit64(__uint64_t v); +static inline int xfs_lowbit64(__uint64_t v) +{ + __uint32_t w = (__uint32_t)v; + int n = 0; -/* Get high bit set out of 64-bit argument, -1 if none set */ -extern int xfs_highbit64(__uint64_t); + if (w) { /* lower bits */ + n = ffs(w); + } else { /* upper bits */ + w = (__uint32_t)(v >> 32); + if (w) { + n = ffs(w); + if (n) + n += 32; + } + } + return n - 1; +} -/* Count set bits in map starting with start_bit */ -extern int xfs_count_bits(uint *map, uint size, uint start_bit); +/* Return whether bitmap is empty (1 == empty) */ +extern int xfs_bitmap_empty(uint *map, uint size); /* Count continuous one bits in map starting with start_bit */ extern int xfs_contig_bits(uint *map, uint size, uint start_bit); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index bf46fae303a..75c3fe5f3d9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -17,210 +17,727 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_dir2.h" #include "xfs_inode.h" #include "xfs_btree.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_alloc.h" #include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" #include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_attr_leaf.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_trans_space.h" #include "xfs_buf_item.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_attr_leaf.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" -#ifdef DEBUG -STATIC void -xfs_bmap_check_leaf_extents(xfs_btree_cur_t *cur, xfs_inode_t *ip, int whichfork); -#endif - kmem_zone_t *xfs_bmap_free_item_zone; /* - * Prototypes for internal bmap routines. + * Miscellaneous helper functions + */ + +/* + * Compute and fill in the value of the maximum depth of a bmap btree + * in this filesystem. Done once, during mount. */ +void +xfs_bmap_compute_maxlevels( + xfs_mount_t *mp, /* file system mount structure */ + int whichfork) /* data or attr fork */ +{ + int level; /* btree level */ + uint maxblocks; /* max blocks at this level */ + uint maxleafents; /* max leaf entries possible */ + int maxrootrecs; /* max records in root block */ + int minleafrecs; /* min records in leaf block */ + int minnoderecs; /* min records in node block */ + int sz; /* root block size */ + + /* + * The maximum number of extents in a file, hence the maximum + * number of leaf entries, is controlled by the type of di_nextents + * (a signed 32-bit number, xfs_extnum_t), or by di_anextents + * (a signed 16-bit number, xfs_aextnum_t). + * + * Note that we can no longer assume that if we are in ATTR1 that + * the fork offset of all the inodes will be + * (xfs_default_attroffset(ip) >> 3) because we could have mounted + * with ATTR2 and then mounted back with ATTR1, keeping the + * di_forkoff's fixed but probably at various positions. Therefore, + * for both ATTR1 and ATTR2 we have to assume the worst case scenario + * of a minimum size available. + */ + if (whichfork == XFS_DATA_FORK) { + maxleafents = MAXEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS); + } else { + maxleafents = MAXAEXTNUM; + sz = XFS_BMDR_SPACE_CALC(MINABTPTRS); + } + maxrootrecs = xfs_bmdr_maxrecs(sz, 0); + minleafrecs = mp->m_bmap_dmnr[0]; + minnoderecs = mp->m_bmap_dmnr[1]; + maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; + for (level = 1; maxblocks > 1; level++) { + if (maxblocks <= maxrootrecs) + maxblocks = 1; + else + maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; + } + mp->m_bm_maxlevels[whichfork] = level; +} +STATIC int /* error */ +xfs_bmbt_lookup_eq( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + +STATIC int /* error */ +xfs_bmbt_lookup_ge( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + int *stat) /* success/failure */ +{ + cur->bc_rec.b.br_startoff = off; + cur->bc_rec.b.br_startblock = bno; + cur->bc_rec.b.br_blockcount = len; + return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); +} /* - * Called from xfs_bmap_add_attrfork to handle extents format files. + * Check if the inode needs to be converted to btree format. */ -STATIC int /* error */ -xfs_bmap_add_attrfork_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ +static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork); +} /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Check if the inode should be converted to extent format. */ -STATIC int /* error */ -xfs_bmap_add_attrfork_local( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated */ - xfs_bmap_free_t *flist, /* blocks to free at commit */ - int *flags); /* inode logging flags */ +static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork) +{ + return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && + XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork); +} /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). + * Update the record referred to by cur to the value given + * by [off, bno, len, state]. + * This either works (return 0) or gets an EFSCORRUPTED error. */ -STATIC int /* error */ -xfs_bmap_add_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int whichfork, /* data or attr fork */ - int rsvd); /* OK to allocate reserved blocks */ +STATIC int +xfs_bmbt_update( + struct xfs_btree_cur *cur, + xfs_fileoff_t off, + xfs_fsblock_t bno, + xfs_filblks_t len, + xfs_exntst_t state) +{ + union xfs_btree_rec rec; + + xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state); + return xfs_btree_update(cur, &rec); +} /* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Compute the worst-case number of indirect blocks that will be used + * for ip's delayed extent of length "len". */ -STATIC int /* error */ -xfs_bmap_add_extent_delay_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int rsvd); /* OK to allocate reserved blocks */ +STATIC xfs_filblks_t +xfs_bmap_worst_indlen( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ +{ + int level; /* btree level number */ + int maxrecs; /* maximum record count at this level */ + xfs_mount_t *mp; /* mount structure */ + xfs_filblks_t rval; /* return value */ + + mp = ip->i_mount; + maxrecs = mp->m_bmap_dmxr[0]; + for (level = 0, rval = 0; + level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); + level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + rval += len; + if (len == 1) + return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + level - 1; + if (level == 0) + maxrecs = mp->m_bmap_dmxr[1]; + } + return rval; +} /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. + * Calculate the default attribute fork offset for newly created inodes. */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp,/* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int rsvd); /* OK to allocate reserved blocks */ +uint +xfs_default_attroffset( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint offset; + + if (mp->m_sb.sb_inodesize == 256) { + offset = XFS_LITINO(mp, ip->i_d.di_version) - + XFS_BMDR_SPACE_CALC(MINABTPTRS); + } else { + offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); + } + + ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version)); + return offset; +} /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Helper routine to reset inode di_forkoff field when switching + * attribute fork from local to extent format - we reset it where + * possible to make space available for inline data fork extents. */ -STATIC int /* error */ -xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int whichfork); /* data or attr fork */ +STATIC void +xfs_bmap_forkoff_reset( + xfs_inode_t *ip, + int whichfork) +{ + if (whichfork == XFS_ATTR_FORK && + ip->i_d.di_format != XFS_DINODE_FMT_DEV && + ip->i_d.di_format != XFS_DINODE_FMT_UUID && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { + uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; + + if (dfl_forkoff > ip->i_d.di_forkoff) + ip->i_d.di_forkoff = dfl_forkoff; + } +} /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Debug/sanity checking code */ -STATIC int /* error */ -xfs_bmap_add_extent_unwritten_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta); /* Change made to incore extents */ + +STATIC int +xfs_bmap_sanity_check( + struct xfs_mount *mp, + struct xfs_buf *bp, + int level) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && + block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) + return 0; + + if (be16_to_cpu(block->bb_level) != level || + be16_to_cpu(block->bb_numrecs) == 0 || + be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return 0; + + return 1; +} + +#ifdef DEBUG +STATIC struct xfs_buf * +xfs_bmap_get_bp( + struct xfs_btree_cur *cur, + xfs_fsblock_t bno) +{ + struct xfs_log_item_desc *lidp; + int i; + + if (!cur) + return NULL; + + for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { + if (!cur->bc_bufs[i]) + break; + if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + return cur->bc_bufs[i]; + } + + /* Chase down all the log items to see if the bp is there */ + list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { + struct xfs_buf_log_item *bip; + bip = (struct xfs_buf_log_item *)lidp->lid_item; + if (bip->bli_item.li_type == XFS_LI_BUF && + XFS_BUF_ADDR(bip->bli_buf) == bno) + return bip->bli_buf; + } + + return NULL; +} + +STATIC void +xfs_check_block( + struct xfs_btree_block *block, + xfs_mount_t *mp, + int root, + short sz) +{ + int i, j, dmxr; + __be64 *pp, *thispa; /* pointer to block address */ + xfs_bmbt_key_t *prevp, *keyp; + + ASSERT(be16_to_cpu(block->bb_level) > 0); + + prevp = NULL; + for( i = 1; i <= xfs_btree_get_numrecs(block); i++) { + dmxr = mp->m_bmap_dmxr[0]; + keyp = XFS_BMBT_KEY_ADDR(mp, block, i); + + if (prevp) { + ASSERT(be64_to_cpu(prevp->br_startoff) < + be64_to_cpu(keyp->br_startoff)); + } + prevp = keyp; + + /* + * Compare the block numbers to see if there are dups. + */ + if (root) + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz); + else + pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr); + + for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { + if (root) + thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz); + else + thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); + if (*thispa == *pp) { + xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", + __func__, j, i, + (unsigned long long)be64_to_cpu(*thispa)); + panic("%s: ptrs are equal in node\n", + __func__); + } + } + } +} /* - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. - * It figures out where to ask the underlying allocator to put the new extent. + * Check that the extents for the inode ip are in the right order in all + * btree leaves. */ -STATIC int /* error */ -xfs_bmap_alloc( - xfs_bmalloca_t *ap); /* bmap alloc argument struct */ + +STATIC void +xfs_bmap_check_leaf_extents( + xfs_btree_cur_t *cur, /* btree cursor or null */ + xfs_inode_t *ip, /* incore inode pointer */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ + int error; /* error return value */ + xfs_extnum_t i=0, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + xfs_bmbt_rec_t *ep; /* pointer to current extent */ + xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */ + xfs_bmbt_rec_t *nextp; /* pointer to next extent */ + int bp_release = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { + return; + } + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + block = ifp->if_broot; + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + xfs_check_block(block, mp, 1, ifp->if_broot_bytes); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + /* + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. + */ + while (level-- > 0) { + /* See if buf is in cur first */ + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + + /* + * Check this block for basic sanity (increasing keys and + * no duplicate blocks). + */ + + xfs_check_block(block, mp, 0, 0); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + } + + /* + * Here with bp and block set to the leftmost leaf node in the tree. + */ + i = 0; + + /* + * Loop over all leaf nodes checking that all extents are in the right order. + */ + for (;;) { + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + + + num_recs = xfs_btree_get_numrecs(block); + + /* + * Read-ahead the next leaf block, if any. + */ + + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + + /* + * Check all the extents to make sure they are OK. + * If we had a previous block, the last entry should + * conform with the first entry in this one. + */ + + ep = XFS_BMBT_REC_ADDR(mp, block, 1); + if (i) { + ASSERT(xfs_bmbt_disk_get_startoff(&last) + + xfs_bmbt_disk_get_blockcount(&last) <= + xfs_bmbt_disk_get_startoff(ep)); + } + for (j = 1; j < num_recs; j++) { + nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1); + ASSERT(xfs_bmbt_disk_get_startoff(ep) + + xfs_bmbt_disk_get_blockcount(ep) <= + xfs_bmbt_disk_get_startoff(nextp)); + ep = nextp; + } + + last = *ep; + i += num_recs; + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + bno = nextbno; + /* + * If we've reached the end, stop. + */ + if (bno == NULLFSBLOCK) + break; + + bp_release = 0; + bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); + if (!bp) { + bp_release = 1; + error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + goto error_norelse; + } + block = XFS_BUF_TO_BLOCK(bp); + } + if (bp_release) { + bp_release = 0; + xfs_trans_brelse(NULL, bp); + } + return; + +error0: + xfs_warn(mp, "%s: at error0", __func__); + if (bp_release) + xfs_trans_brelse(NULL, bp); +error_norelse: + xfs_warn(mp, "%s: BAD after btree leaves for %d extents", + __func__, i); + panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + return; +} /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Add bmap trace insert entries for all the contents of the extent records. */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ +void +xfs_bmap_trace_exlist( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t cnt, /* count of entries in the list */ + int whichfork, /* data or attr fork */ + unsigned long caller_ip) +{ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int state = 0; + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + for (idx = 0; idx < cnt; idx++) + trace_xfs_extlist(ip, idx, whichfork, caller_ip); +} -#ifdef DEBUG /* - * Check that the extents list for the inode ip is in the right order. + * Validate that the bmbt_irecs being returned from bmapi are valid + * given the caller's original parameters. Specifically check the + * ranges of the returned irecs to ensure that they only extend beyond + * the given parameters if the XFS_BMAPI_ENTIRE flag was set. */ STATIC void -xfs_bmap_check_extents( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork); /* data or attr fork */ -#endif +xfs_bmap_validate_ret( + xfs_fileoff_t bno, + xfs_filblks_t len, + int flags, + xfs_bmbt_irec_t *mval, + int nmap, + int ret_nmap) +{ + int i; /* index to map values */ + + ASSERT(ret_nmap <= nmap); + + for (i = 0; i < ret_nmap; i++) { + ASSERT(mval[i].br_blockcount > 0); + if (!(flags & XFS_BMAPI_ENTIRE)) { + ASSERT(mval[i].br_startoff >= bno); + ASSERT(mval[i].br_blockcount <= len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= + bno + len); + } else { + ASSERT(mval[i].br_startoff < bno + len); + ASSERT(mval[i].br_startoff + mval[i].br_blockcount > + bno); + } + ASSERT(i == 0 || + mval[i - 1].br_startoff + mval[i - 1].br_blockcount == + mval[i].br_startoff); + ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && + mval[i].br_startblock != HOLESTARTBLOCK); + ASSERT(mval[i].br_state == XFS_EXT_NORM || + mval[i].br_state == XFS_EXT_UNWRITTEN); + } +} + +#else +#define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#endif /* DEBUG */ /* - * Called by xfs_bmapi to update file extent records and the btree - * after removing space (or undoing a delayed allocation). + * bmap free list manipulation functions */ -STATIC int /* error */ -xfs_bmap_del_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_trans_t *tp, /* current trans pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp,/* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int whichfork, /* data or attr fork */ - int rsvd); /* OK to allocate reserved blocks */ + +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +xfs_bmap_add_free( + xfs_fsblock_t bno, /* fs block number of extent */ + xfs_filblks_t len, /* length of extent */ + xfs_bmap_free_t *flist, /* list of extents */ + xfs_mount_t *mp) /* mount point structure */ +{ + xfs_bmap_free_item_t *cur; /* current (next) element */ + xfs_bmap_free_item_t *new; /* new element */ + xfs_bmap_free_item_t *prev; /* previous element */ +#ifdef DEBUG + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_bmap_free_item_zone != NULL); + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xbfi_startblock = bno; + new->xbfi_blockcount = (xfs_extlen_t)len; + for (prev = NULL, cur = flist->xbf_first; + cur != NULL; + prev = cur, cur = cur->xbfi_next) { + if (cur->xbfi_startblock >= bno) + break; + } + if (prev) + prev->xbfi_next = new; + else + flist->xbf_first = new; + new->xbfi_next = cur; + flist->xbf_count++; +} /* * Remove the entry "free" from the free item list. Prev points to the * previous entry, unless "free" is the head of the list. */ -STATIC void +void xfs_bmap_del_free( xfs_bmap_free_t *flist, /* free item list header */ xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free); /* list item to be freed */ + xfs_bmap_free_item_t *free) /* list item to be freed */ +{ + if (prev) + prev->xbfi_next = free->xbfi_next; + else + flist->xbf_first = free->xbfi_next; + flist->xbf_count--; + kmem_zone_free(xfs_bmap_free_item_zone, free); +} + +/* + * Free up any items left in the list. + */ +void +xfs_bmap_cancel( + xfs_bmap_free_t *flist) /* list of bmap_free_items */ +{ + xfs_bmap_free_item_t *free; /* free list item */ + xfs_bmap_free_item_t *next; + + if (flist->xbf_count == 0) + return; + ASSERT(flist->xbf_first != NULL); + for (free = flist->xbf_first; free; free = next) { + next = free->xbfi_next; + xfs_bmap_del_free(flist, NULL, free); + } + ASSERT(flist->xbf_count == 0); +} + +/* + * Inode fork format manipulation functions + */ + +/* + * Transform a btree format file with only one leaf node, where the + * extents list will fit in the inode, into an extents format file. + * Since the file extents are already in-core, all we have to do is + * give up the space for the btree root and pitch the leaf block. + */ +STATIC int /* error */ +xfs_bmap_btree_to_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode pointer */ + xfs_btree_cur_t *cur, /* btree cursor */ + int *logflagsp, /* inode logging flags */ + int whichfork) /* data or attr fork */ +{ + /* REFERENCED */ + struct xfs_btree_block *cblock;/* child btree block */ + xfs_fsblock_t cbno; /* child block number */ + xfs_buf_t *cbp; /* child block's buffer */ + int error; /* error return value */ + xfs_ifork_t *ifp; /* inode fork data */ + xfs_mount_t *mp; /* mount point structure */ + __be64 *pp; /* ptr to block address */ + struct xfs_btree_block *rblock;/* root btree block */ + + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); + rblock = ifp->if_broot; + ASSERT(be16_to_cpu(rblock->bb_level) == 1); + ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); + ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes); + cbno = be64_to_cpu(*pp); + *logflagsp = 0; +#ifdef DEBUG + if ((error = xfs_btree_check_lptr(cur, cbno, 1))) + return error; +#endif + error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + cblock = XFS_BUF_TO_BLOCK(cbp); + if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) + return error; + xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, cbp); + if (cur->bc_bufs[0] == cbp) + cur->bc_bufs[0] = NULL; + xfs_iroot_realloc(ip, -1, whichfork); + ASSERT(ifp->if_broot == NULL); + ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); + return 0; +} /* * Convert an extents-format file into a btree-format file. @@ -235,190 +752,255 @@ xfs_bmap_extents_to_btree( xfs_btree_cur_t **curp, /* cursor returned to caller */ int wasdel, /* converting a delayed alloc */ int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ + int whichfork) /* data or attr fork */ +{ + struct xfs_btree_block *ablock; /* allocated (child) bt block */ + xfs_buf_t *abp; /* buffer for ablock */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_bmbt_rec_t *arp; /* child record pointer */ + struct xfs_btree_block *block; /* btree root block */ + xfs_btree_cur_t *cur; /* bmap btree cursor */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + int error; /* error return value */ + xfs_extnum_t i, cnt; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_key_t *kp; /* root block key pointer */ + xfs_mount_t *mp; /* mount structure */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_bmbt_ptr_t *pp; /* root block address pointer */ -/* - * Convert a local file to an extents file. - * This code is sort of bogus, since the file data needs to get - * logged so it won't be lost. The bmap-level manipulations are ok, though. - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork); /* data or attr fork */ + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp); /* out: previous extent entry found */ + /* + * Make space in the inode incore. + */ + xfs_iroot_realloc(ip, 1, whichfork); + ifp->if_flags |= XFS_IFBROOT; -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof); /* return value */ + /* + * Fill in the root. + */ + block = ifp->if_broot; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 1, 1, ip->i_ino, + XFS_BTREE_LONG_PTRS); -#ifdef XFS_BMAP_TRACE -/* - * Add a bmap trace buffer entry. Base routine for the others. - */ -STATIC void -xfs_bmap_trace_addentry( - int opcode, /* operation */ - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(ies) */ - xfs_extnum_t cnt, /* count of entries, 1 or 2 */ - xfs_bmbt_rec_t *r1, /* first record */ - xfs_bmbt_rec_t *r2, /* second record or null */ - int whichfork); /* data or attr fork */ + /* + * Need a cursor. Can't allocate until bb_level is filled in. + */ + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + /* + * Convert to a btree with two levels, one record in root. + */ + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = mp; + args.firstblock = *firstblock; + if (*firstblock == NULLFSBLOCK) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); + } else if (flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + args.fsbno = *firstblock; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.fsbno = *firstblock; + } + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = wasdel; + *logflagsp = 0; + if ((error = xfs_alloc_vextent(&args))) { + xfs_iroot_realloc(ip, -1, whichfork); + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + /* + * Allocation can't fail, the space was reserved. + */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(*firstblock == NULLFSBLOCK || + args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || + (flist->xbf_low && + args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); + *firstblock = cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + ip->i_d.di_nblocks++; + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); + abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); + /* + * Fill in the child block. + */ + abp->b_ops = &xfs_bmbt_buf_ops; + ablock = XFS_BUF_TO_BLOCK(abp); + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, ablock, abp->b_bn, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); -/* - * Add bmap trace entry prior to a call to xfs_iext_remove. - */ -STATIC void -xfs_bmap_trace_delete( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) deleted */ - xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ - int whichfork); /* data or attr fork */ + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (cnt = i = 0; i < nextents; i++) { + ep = xfs_iext_get_ext(ifp, i); + if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { + arp->l0 = cpu_to_be64(ep->l0); + arp->l1 = cpu_to_be64(ep->l1); + arp++; cnt++; + } + } + ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); + xfs_btree_set_numrecs(ablock, cnt); -/* - * Add bmap trace entry prior to a call to xfs_iext_insert, or - * reading in the extents list from the disk (in the btree). - */ -STATIC void -xfs_bmap_trace_insert( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) inserted */ - xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ - xfs_bmbt_irec_t *r1, /* inserted record 1 */ - xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ - int whichfork); /* data or attr fork */ + /* + * Fill in the root key and pointer. + */ + kp = XFS_BMBT_KEY_ADDR(mp, block, 1); + arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); + kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp)); + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur, + be16_to_cpu(block->bb_level))); + *pp = cpu_to_be64(args.fsbno); -/* - * Add bmap trace entry after updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_post_update( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry updated */ - int whichfork); /* data or attr fork */ + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS); + xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); + ASSERT(*curp == NULL); + *curp = cur; + *logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork); + return 0; +} /* - * Add bmap trace entry prior to updating an extent record in place. + * Convert a local file to an extents file. + * This code is out of bounds for data forks of regular files, + * since the file data needs to get logged so things will stay consistent. + * (The bmap-level manipulations are ok, though). */ -STATIC void -xfs_bmap_trace_pre_update( - char *fname, /* function name */ - char *desc, /* operation description */ +void +xfs_bmap_local_to_extents_empty( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + ASSERT(ifp->if_bytes == 0); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); + + xfs_bmap_forkoff_reset(ip, whichfork); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); +} + + +STATIC int /* error */ +xfs_bmap_local_to_extents( + xfs_trans_t *tp, /* transaction pointer */ xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry to be updated */ - int whichfork); /* data or attr fork */ + xfs_fsblock_t *firstblock, /* first block allocated in xaction */ + xfs_extlen_t total, /* total blocks needed by transaction */ + int *logflagsp, /* inode logging flags */ + int whichfork, + void (*init_fn)(struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp)) +{ + int error = 0; + int flags; /* logging flags returned */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_alloc_arg_t args; /* allocation arguments */ + xfs_buf_t *bp; /* buffer for extent block */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ -#else -#define xfs_bmap_trace_delete(f,d,ip,i,c,w) -#define xfs_bmap_trace_insert(f,d,ip,i,c,r1,r2,w) -#define xfs_bmap_trace_post_update(f,d,ip,i,w) -#define xfs_bmap_trace_pre_update(f,d,ip,i,w) -#endif /* XFS_BMAP_TRACE */ + /* + * We don't want to deal with the case of keeping inode data inline yet. + * So sending the data fork of a regular inode is invalid. + */ + ASSERT(!(S_ISREG(ip->i_d.di_mode) && whichfork == XFS_DATA_FORK)); + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len); /* delayed extent length */ + if (!ifp->if_bytes) { + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags = XFS_ILOG_CORE; + goto done; + } -#ifdef DEBUG -/* - * Perform various validation checks on the values being returned - * from xfs_bmapi(). - */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap); -#else -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) -#endif /* DEBUG */ + flags = 0; + error = 0; + ASSERT((ifp->if_flags & (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == + XFS_IFINLINE); + memset(&args, 0, sizeof(args)); + args.tp = tp; + args.mp = ip->i_mount; + args.firstblock = *firstblock; + /* + * Allocate a block. We know we need only one, since the + * file currently fits in an inode. + */ + if (*firstblock == NULLFSBLOCK) { + args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.fsbno = *firstblock; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + args.total = total; + args.minlen = args.maxlen = args.prod = 1; + error = xfs_alloc_vextent(&args); + if (error) + goto done; -#if defined(XFS_RW_TRACE) -STATIC void -xfs_bunmap_trace( - xfs_inode_t *ip, - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - inst_t *ra); -#else -#define xfs_bunmap_trace(ip, bno, len, flags, ra) -#endif /* XFS_RW_TRACE */ + /* Can't fail, the space was reserved. */ + ASSERT(args.fsbno != NULLFSBLOCK); + ASSERT(args.len == 1); + *firstblock = args.fsbno; + bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); -STATIC int -xfs_bmap_count_tree( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ifork_t *ifp, - xfs_fsblock_t blockno, - int levelin, - int *count); + /* initialise the block and copy the data */ + init_fn(tp, bp, ip, ifp); -STATIC int -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count); + /* account for the change in fork size and log everything */ + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); + xfs_bmap_local_to_extents_empty(ip, whichfork); + flags |= XFS_ILOG_CORE; -STATIC int -xfs_bmap_disk_count_leaves( - xfs_ifork_t *ifp, - xfs_mount_t *mp, - xfs_extnum_t idx, - xfs_bmbt_block_t *block, - int numrecs, - int *count); + xfs_iext_add(ifp, 0, 1); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + trace_xfs_bmap_post_update(ip, 0, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, + _THIS_IP_); + XFS_IFORK_NEXT_SET(ip, whichfork, 1); + ip->i_d.di_nblocks = 1; + xfs_trans_mod_dquot_byino(tp, ip, + XFS_TRANS_DQ_BCOUNT, 1L); + flags |= xfs_ilog_fext(whichfork); -/* - * Bmap internal routines. - */ +done: + *logflagsp = flags; + return error; +} /* * Called from xfs_bmap_add_attrfork to handle btree format files. @@ -440,14 +1022,14 @@ xfs_bmap_add_attrfork_btree( if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip)) *flags |= XFS_ILOG_DBROOT; else { - cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, - XFS_DATA_FORK); + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); cur->bc_private.b.flist = flist; cur->bc_private.b.firstblock = *firstblock; if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; - ASSERT(stat == 1); /* must be at least one entry */ - if ((error = xfs_bmbt_newroot(cur, flags, &stat))) + /* must be at least one entry */ + XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); @@ -491,7 +1073,15 @@ xfs_bmap_add_attrfork_extents( } /* - * Called from xfs_bmap_add_attrfork to handle local format files. + * Called from xfs_bmap_add_attrfork to handle local format files. Each + * different data fork content type needs a different callout to do the + * conversion. Some are basic and only require special block initialisation + * callouts for the data formating, others (directories) are so specialised they + * handle everything themselves. + * + * XXX (dgc): investigate whether directory conversion can use the generic + * formatting callout. It should be possible - it's just a very complex + * formatter. */ STATIC int /* error */ xfs_bmap_add_attrfork_local( @@ -502,230 +1092,662 @@ xfs_bmap_add_attrfork_local( int *flags) /* inode logging flags */ { xfs_da_args_t dargs; /* args for dir/attr code */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount structure pointer */ if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { - mp = ip->i_mount; + + if (S_ISDIR(ip->i_d.di_mode)) { memset(&dargs, 0, sizeof(dargs)); + dargs.geo = ip->i_mount->m_dir_geo; dargs.dp = ip; dargs.firstblock = firstblock; dargs.flist = flist; - dargs.total = mp->m_dirblkfsbs; + dargs.total = dargs.geo->fsbcount; dargs.whichfork = XFS_DATA_FORK; dargs.trans = tp; - error = xfs_dir2_sf_to_block(&dargs); - } else - error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, - XFS_DATA_FORK); + return xfs_dir2_sf_to_block(&dargs); + } + + if (S_ISLNK(ip->i_d.di_mode)) + return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, + flags, XFS_DATA_FORK, + xfs_symlink_local_to_remote); + + /* should only be called for types that support local format data */ + ASSERT(0); + return EFSCORRUPTED; +} + +/* + * Convert inode from non-attributed to attributed. + * Must not be in a transaction, ip must not be locked. + */ +int /* error code */ +xfs_bmap_add_attrfork( + xfs_inode_t *ip, /* incore inode pointer */ + int size, /* space new attribute needs */ + int rsvd) /* xact may use reserved blks */ +{ + xfs_fsblock_t firstblock; /* 1st block/ag allocated */ + xfs_bmap_free_t flist; /* freed extent records */ + xfs_mount_t *mp; /* mount structure */ + xfs_trans_t *tp; /* transaction pointer */ + int blks; /* space reservation */ + int version = 1; /* superblock attr version */ + int committed; /* xaction was committed */ + int logflags; /* logging flags */ + int error; /* error return value */ + int cancel_flags = 0; + + ASSERT(XFS_IFORK_Q(ip) == 0); + + mp = ip->i_mount; + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); + blks = XFS_ADDAFORK_SPACE_RES(mp); + if (rsvd) + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? + XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : + XFS_QMOPT_RES_REGBLKS); + if (error) + goto trans_cancel; + cancel_flags |= XFS_TRANS_ABORT; + if (XFS_IFORK_Q(ip)) + goto trans_cancel; + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { + /* + * For inodes coming from pre-6.2 filesystems. + */ + ASSERT(ip->i_d.di_aformat == 0); + ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; + } + ASSERT(ip->i_d.di_anextents == 0); + + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_DEV: + ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; + break; + case XFS_DINODE_FMT_UUID: + ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; + break; + case XFS_DINODE_FMT_LOCAL: + case XFS_DINODE_FMT_EXTENTS: + case XFS_DINODE_FMT_BTREE: + ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); + if (!ip->i_d.di_forkoff) + ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3; + else if (mp->m_flags & XFS_MOUNT_ATTR2) + version = 2; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + goto trans_cancel; + } + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); + ip->i_afp->if_flags = XFS_IFEXTENTS; + logflags = 0; + xfs_bmap_init(&flist, &firstblock); + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_LOCAL: + error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, + &logflags); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, + &flist, &logflags); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, + &logflags); + break; + default: + error = 0; + break; + } + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (error) + goto bmap_cancel; + if (!xfs_sb_version_hasattr(&mp->m_sb) || + (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + __int64_t sbfields = 0; + + spin_lock(&mp->m_sb_lock); + if (!xfs_sb_version_hasattr(&mp->m_sb)) { + xfs_sb_version_addattr(&mp->m_sb); + sbfields |= XFS_SB_VERSIONNUM; + } + if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { + xfs_sb_version_addattr2(&mp->m_sb); + sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); + } + if (sbfields) { + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + } else + spin_unlock(&mp->m_sb_lock); + } + + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +bmap_cancel: + xfs_bmap_cancel(&flist); +trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* - * Called by xfs_bmapi to update file extent records and the btree - * after allocating space (or doing a delayed allocation). + * Internal and external extent tree search functions. */ -STATIC int /* error */ -xfs_bmap_add_extent( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int whichfork, /* data or attr fork */ - int rsvd) /* OK to use reserved data blocks */ + +/* + * Read in the extents to if_extents. + * All inode fields are set up by caller, we just traverse the btree + * and copy the records in. If the file system cannot contain unwritten + * extents, the records are checked for no "state" flags. + */ +int /* error */ +xfs_bmap_read_extents( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ { - xfs_btree_cur_t *cur; /* btree cursor or null */ - xfs_filblks_t da_new; /* new count del alloc blocks used */ - xfs_filblks_t da_old; /* old count del alloc blocks used */ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent"; -#endif - xfs_ifork_t *ifp; /* inode fork ptr */ - int logflags; /* returned value */ - xfs_extnum_t nextents; /* number of extents in file now */ + xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ + xfs_extnum_t i, j; /* index into the extents list */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + /* REFERENCED */ + xfs_extnum_t room; /* number of entries there's room for */ - XFS_STATS_INC(xs_add_exlist); - cur = *curp; + bno = NULLFSBLOCK; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(idx <= nextents); - da_old = da_new = 0; - error = 0; + exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : + XFS_EXTFMT_INODE(ip); + block = ifp->if_broot; /* - * This is the first extent added to a new/empty file. - * Special case this one, so other routines get to assume there are - * already extents in the list. + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. */ - if (nextents == 0) { - xfs_bmap_trace_insert(fname, "insert empty", ip, 0, 1, new, - NULL, whichfork); - xfs_iext_insert(ifp, 0, 1, new); - ASSERT(cur == NULL); - ifp->if_lastex = 0; - if (!ISNULLSTARTBLOCK(new->br_startblock)) { - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - logflags = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); - } else - logflags = 0; - /* DELTA: single new extent */ - if (delta) { - if (delta->xed_startoff > new->br_startoff) - delta->xed_startoff = new->br_startoff; - if (delta->xed_blockcount < - new->br_startoff + new->br_blockcount) - delta->xed_blockcount = new->br_startoff + - new->br_blockcount; - } - } + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); /* - * Any kind of new delayed allocation goes here. + * Go down the tree until leaf level is reached, following the first + * pointer (leftmost) at each level. */ - else if (ISNULLSTARTBLOCK(new->br_startblock)) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_delay(ip, idx, cur, new, - &logflags, delta, rsvd))) - goto done; + while (level-- > 0) { + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, level), + error0); + if (level == 0) + break; + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + xfs_trans_brelse(tp, bp); } /* - * Real allocation off the end of the file. + * Here with bp and block set to the leftmost leaf node in the tree. + */ + room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + i = 0; + /* + * Loop over all leaf nodes. Copy information to the extent records. */ - else if (idx == nextents) { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, new, - &logflags, delta, whichfork))) - goto done; - } else { - xfs_bmbt_irec_t prev; /* old extent at offset idx */ + for (;;) { + xfs_bmbt_rec_t *frp; + xfs_fsblock_t nextbno; + xfs_extnum_t num_recs; + xfs_extnum_t start; + num_recs = xfs_btree_get_numrecs(block); + if (unlikely(i + num_recs > room)) { + ASSERT(i + num_recs <= room); + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, (btree extents).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)", + XFS_ERRLEVEL_LOW, ip->i_mount, block); + goto error0; + } + XFS_WANT_CORRUPTED_GOTO( + xfs_bmap_sanity_check(mp, bp, 0), + error0); /* - * Get the record referred to by idx. + * Read-ahead the next leaf block, if any. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &prev); + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + if (nextbno != NULLFSBLOCK) + xfs_btree_reada_bufl(mp, nextbno, 1, + &xfs_bmbt_buf_ops); /* - * If it's a real allocation record, and the new allocation ends - * after the start of the referred to record, then we're filling - * in a delayed or unwritten allocation with a real one, or - * converting real back to unwritten. + * Copy records into the extent records. */ - if (!ISNULLSTARTBLOCK(new->br_startblock) && - new->br_startoff + new->br_blockcount > prev.br_startoff) { - if (prev.br_state != XFS_EXT_UNWRITTEN && - ISNULLSTARTBLOCK(prev.br_startblock)) { - da_old = STARTBLOCKVAL(prev.br_startblock); - if (cur) - ASSERT(cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL); - if ((error = xfs_bmap_add_extent_delay_real(ip, - idx, &cur, new, &da_new, first, flist, - &logflags, delta, rsvd))) - goto done; - } else if (new->br_state == XFS_EXT_NORM) { - ASSERT(new->br_state == XFS_EXT_NORM); - if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags, delta))) - goto done; - } else { - ASSERT(new->br_state == XFS_EXT_UNWRITTEN); - if ((error = xfs_bmap_add_extent_unwritten_real( - ip, idx, &cur, new, &logflags, delta))) - goto done; + frp = XFS_BMBT_REC_ADDR(mp, block, 1); + start = i; + for (j = 0; j < num_recs; j++, i++, frp++) { + xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); + trp->l0 = be64_to_cpu(frp->l0); + trp->l1 = be64_to_cpu(frp->l1); + } + if (exntf == XFS_EXTFMT_NOSTATE) { + /* + * Check all attribute bmap btree records and + * any "older" data bmap btree records for a + * set bit in the "extent flag" position. + */ + if (unlikely(xfs_check_nostate_extents(ifp, + start, num_recs))) { + XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + goto error0; } - ASSERT(*curp == cur || *curp == NULL); } + xfs_trans_brelse(tp, bp); + bno = nextbno; /* - * Otherwise we're filling in a hole with an allocation. + * If we've reached the end, stop. */ - else { - if (cur) - ASSERT((cur->bc_private.b.flags & - XFS_BTCUR_BPRV_WASDEL) == 0); - if ((error = xfs_bmap_add_extent_hole_real(ip, idx, cur, - new, &logflags, delta, whichfork))) - goto done; - } + if (bno == NULLFSBLOCK) + break; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); + if (error) + return error; + block = XFS_BUF_TO_BLOCK(bp); } + ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); + return 0; +error0: + xfs_trans_brelse(tp, bp); + return XFS_ERROR(EFSCORRUPTED); +} + + +/* + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_multi_extents( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t lastx; /* last extent index */ - ASSERT(*curp == cur || *curp == NULL); /* - * Convert to a btree if necessary. + * Initialize the extent entry structure to catch access to + * uninitialized br_startblock field. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { - int tmp_logflags; /* partial log flag return val */ + gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; + gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; + gotp->br_state = XFS_EXT_INVALID; +#if XFS_BIG_BLKNOS + gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; +#else + gotp->br_startblock = 0xffffa5a5; +#endif + prevp->br_startoff = NULLFILEOFF; - ASSERT(cur == NULL); - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, first, - flist, &cur, da_old > 0, &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto done; + ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); + if (lastx > 0) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); + } + if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { + xfs_bmbt_get_all(ep, gotp); + *eofp = 0; + } else { + if (lastx > 0) { + *gotp = *prevp; + } + *eofp = 1; + ep = NULL; + } + *lastxp = lastx; + return ep; +} + +/* + * Search the extents list for the inode, for the extent containing bno. + * If bno lies in a hole, point to the next entry. If bno lies past eof, + * *eofp will be set, and *prevp will contain the last entry (null if none). + * Else, *lastxp will be set to the index of the found + * entry; *gotp will contain the entry. + */ +STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ +xfs_bmap_search_extents( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_fileoff_t bno, /* block number searched for */ + int fork, /* data or attr fork */ + int *eofp, /* out: end of file found */ + xfs_extnum_t *lastxp, /* out: last extent index */ + xfs_bmbt_irec_t *gotp, /* out: extent entry found */ + xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ +{ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + + XFS_STATS_INC(xs_look_exlist); + ifp = XFS_IFORK_PTR(ip, fork); + + ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); + + if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && + !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x lastx: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)gotp->br_startblock, + (unsigned long long)gotp->br_startoff, + (unsigned long long)gotp->br_blockcount, + gotp->br_state, *lastxp); + *lastxp = NULLEXTNUM; + *eofp = 1; + return NULL; + } + return ep; +} + +/* + * Returns the file-relative block number of the first unused block(s) + * in the file with at least "len" logically contiguous blocks free. + * This is the lowest-address hole if the file has holes, else the first block + * past the end of file. + * Return 0 if the file is currently local (in-inode). + */ +int /* error */ +xfs_bmap_first_unused( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_extlen_t len, /* size of hole to find */ + xfs_fileoff_t *first_unused, /* unused block */ + int whichfork) /* data or attr fork */ +{ + int error; /* error return value */ + int idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_fileoff_t lastaddr; /* last block number seen */ + xfs_fileoff_t lowest; /* lowest useful block */ + xfs_fileoff_t max; /* starting useful block */ + xfs_fileoff_t off; /* offset for this block */ + xfs_extnum_t nextents; /* number of extent entries */ + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *first_unused = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + lowest = *first_unused; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); + off = xfs_bmbt_get_startoff(ep); + /* + * See if the hole before this extent will work. + */ + if (off >= lowest + len && off - max >= len) { + *first_unused = max; + return 0; + } + lastaddr = off + xfs_bmbt_get_blockcount(ep); + max = XFS_FILEOFF_MAX(lastaddr, lowest); + } + *first_unused = max; + return 0; +} + +/* + * Returns the file-relative block number of the last block - 1 before + * last_block (input value) in the file. + * This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int /* error */ +xfs_bmap_last_before( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ +{ + xfs_fileoff_t bno; /* input file offset */ + int eof; /* hit end of file */ + xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ + int error; /* error return value */ + xfs_bmbt_irec_t got; /* current extent value */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent used */ + xfs_bmbt_irec_t prev; /* previous extent value */ + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EIO); + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + *last_block = 0; + return 0; + } + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS) && + (error = xfs_iread_extents(tp, ip, whichfork))) + return error; + bno = *last_block - 1; + ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, + &prev); + if (eof || xfs_bmbt_get_startoff(ep) > bno) { + if (prev.br_startoff == NULLFILEOFF) + *last_block = 0; + else + *last_block = prev.br_startoff + prev.br_blockcount; } /* - * Adjust for changes in reserved delayed indirect blocks. - * Nothing to do for disk quotas here. + * Otherwise *last_block is already the right answer. */ - if (da_old || da_new) { - xfs_filblks_t nblks; - - nblks = da_new; - if (cur) - nblks += cur->bc_private.b.allocated; - ASSERT(nblks <= da_old); - if (nblks < da_old) - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, - (int)(da_old - nblks), rsvd); + return 0; +} + +int +xfs_bmap_last_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + struct xfs_bmbt_irec *rec, + int *is_empty) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int error; + int nextents; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *is_empty = 1; + return 0; + } + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, nextents - 1), rec); + *is_empty = 0; + return 0; +} + +/* + * Check the last inode extent to determine whether this allocation will result + * in blocks being allocated at the end of the file. When we allocate new data + * blocks at the end of the file which do not start at the previous data block, + * we will try to align the new blocks at stripe unit boundaries. + * + * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be + * at, or past the EOF. + */ +STATIC int +xfs_bmap_isaeof( + struct xfs_bmalloca *bma, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + bma->aeof = 0; + error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec, + &is_empty); + if (error) + return error; + + if (is_empty) { + bma->aeof = 1; + return 0; } + /* - * Clear out the allocated field, done with it now in any case. + * Check if we are allocation or past the last extent, or at least into + * the last delayed allocated extent. */ - if (cur) { - cur->bc_private.b.allocated = 0; - *curp = cur; - } -done: -#ifdef DEBUG - if (!error) - xfs_bmap_check_leaf_extents(*curp, ip, whichfork); -#endif - *logflagsp = logflags; - return error; + bma->aeof = bma->offset >= rec.br_startoff + rec.br_blockcount || + (bma->offset >= rec.br_startoff && + isnullstartblock(rec.br_startblock)); + return 0; +} + +/* + * Returns the file-relative block number of the first block past eof in + * the file. This is not based on i_size, it is based on the extent records. + * Returns 0 for local files, as they do not have extent records. + */ +int +xfs_bmap_last_offset( + struct xfs_inode *ip, + xfs_fileoff_t *last_block, + int whichfork) +{ + struct xfs_bmbt_irec rec; + int is_empty; + int error; + + *last_block = 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) + return 0; + + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return XFS_ERROR(EIO); + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); + if (error || is_empty) + return error; + + *last_block = rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Returns whether the selected fork of the inode has exactly one + * block or not. For the data fork we check this matches di_size, + * implying the file's range is 0..bsize-1. + */ +int /* 1=>1 block, 0=>otherwise */ +xfs_bmap_one_block( + xfs_inode_t *ip, /* incore inode */ + int whichfork) /* data or attr fork */ +{ + xfs_bmbt_rec_host_t *ep; /* ptr to fork's extent */ + xfs_ifork_t *ifp; /* inode fork pointer */ + int rval; /* return value */ + xfs_bmbt_irec_t s; /* internal version of extent */ + +#ifndef DEBUG + if (whichfork == XFS_DATA_FORK) + return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize; +#endif /* !DEBUG */ + if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) + return 0; + if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ep = xfs_iext_get_ext(ifp, 0); + xfs_bmbt_get_all(ep, &s); + rval = s.br_startoff == 0 && s.br_blockcount == 1; + if (rval && whichfork == XFS_DATA_FORK) + ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize); + return rval; } /* - * Called by xfs_bmap_add_extent to handle cases converting a delayed - * allocation to a real allocation. + * Extent tree manipulation functions used during allocation. + */ + +/* + * Convert a delayed allocation to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_delay_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - xfs_filblks_t *dnew, /* new delayed-alloc indirect blocks */ - xfs_fsblock_t *first, /* pointer to firstblock variable */ - xfs_bmap_free_t *flist, /* list of extents to be freed */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int rsvd) /* OK to use reserved data block allocation */ + struct xfs_bmalloca *bma) { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_bmbt_irec *new = &bma->got; int diff; /* temp value */ - xfs_bmbt_rec_t *ep; /* extent entry for idx */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_delay_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ @@ -733,432 +1755,402 @@ xfs_bmap_add_extent_delay_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; /* value for dnew calculations */ - xfs_filblks_t temp2=0;/* value for dnew calculations */ + xfs_filblks_t da_new; /* new count del alloc blocks used */ + xfs_filblks_t da_old; /* old count del alloc blocks used */ + xfs_filblks_t temp=0; /* value for da_new calculations */ + xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_FILLING, RIGHT_FILLING, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; + + ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) -#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE \ - (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) /* * Set up a bunch of variables to make the tests simpler. */ - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, bma->idx); xfs_bmbt_get_all(ep, &PREV); new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + + da_old = startblockval(PREV.br_startblock); + da_new = 0; + /* * Set flags determining what part of the previous delayed allocation * extent is being replaced by a real allocation. */ - STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); - STATE_SET(RIGHT_FILLING, - PREV.br_startoff + PREV.br_blockcount == new_endoff); + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + /* * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + if (bma->idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; } - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && - LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && - LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && - LEFT.br_state == new->br_state && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == new->br_state && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + /* * Check and set flags if this segment has a right neighbor. * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); + + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; } - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && - new_endoff == RIGHT.br_startoff && - new->br_startblock + new->br_blockcount == - RIGHT.br_startblock && - new->br_state == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && - ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != - MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || - LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)); + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + new->br_state == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + error = 0; /* * Switch out based on the FILLING and CONTIG state bits. */ - switch (SWITCH_STATE) { - - case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The left and right neighbors are both contiguous with new. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + bma->idx--; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 2); - ip->i_df.if_lastex = idx - 1; - ip->i_d.di_nextents--; - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 2, state); + bma->ip->i_d.di_nextents--; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_delete(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + - RIGHT.br_blockcount, LEFT.br_state))) + RIGHT.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; - /* DELTA: Three in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The left neighbor is contiguous, the right is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + bma->idx--; + + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), LEFT.br_blockcount + PREV.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx - 1; - xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 1); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + - PREV.br_blockcount, LEFT.br_state))) + PREV.br_blockcount, LEFT.br_state); + if (error) goto done; } - *dnew = 0; - /* DELTA: Two in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in all of a previously delayed allocation extent. * The right neighbor is contiguous, the left is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx; - xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx + 1, 1); - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, PREV.br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + - RIGHT.br_blockcount, PREV.br_state))) + RIGHT.br_blockcount, PREV.br_state); + if (error) goto done; } - *dnew = 0; - /* DELTA: Two in-core extents are replaced by one. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; - case MASK2(LEFT_FILLING, RIGHT_FILLING): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Filling in all of a previously delayed allocation extent. * Neither the left nor right neighbors are contiguous with * the new one. */ - xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock); - xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx; - ip->i_d.di_nextents++; - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; - ASSERT(i == 0); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - *dnew = 0; - /* DELTA: The in-core extent described by new changed type. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; - case MASK2(LEFT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: /* * Filling in the first part of a previous delayed allocation. * The left neighbor is contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + trace_xfs_bmap_pre_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(bma->ip, bma->idx - 1, state, _THIS_IP_); + temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - ip->i_df.if_lastex = idx - 1; - if (cur == NULL) + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, LEFT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount, - &i))) + &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state))) + LEFT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); - *dnew = temp; - /* DELTA: The boundary between two in-core extents moved. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx--; break; - case MASK(LEFT_FILLING): + case BMAP_LEFT_FILLING: /* * Filling in the first part of a previous delayed allocation. * The left neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, new_endoff); temp = PREV.br_blockcount - new->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx, 1, new); - ip->i_df.if_lastex = idx; - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; - ASSERT(i == 0); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, + &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, idx + 1); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "LF", ip, idx + 1, - XFS_DATA_FORK); - *dnew = temp; - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx + 1); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); break; - case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Filling in the last part of a previous delayed allocation. * The right neighbor is contiguous with the new allocation. */ temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 1, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx + 1), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, RIGHT.br_state); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx + 1; - if (cur == NULL) + trace_xfs_bmap_post_update(bma->ip, bma->idx + 1, state, _THIS_IP_); + if (bma->cur == NULL) rval = XFS_ILOG_DEXT; else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, RIGHT.br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, RIGHT.br_startoff, RIGHT.br_startblock, - RIGHT.br_blockcount, &i))) + RIGHT.br_blockcount, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, - RIGHT.br_state))) + RIGHT.br_state); + if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock)); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); - *dnew = temp; - /* DELTA: The boundary between two in-core extents moved. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; + + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock)); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx++; break; - case MASK(RIGHT_FILLING): + case BMAP_RIGHT_FILLING: /* * Filling in the last part of a previous delayed allocation. * The right neighbor is not contiguous. */ temp = PREV.br_blockcount - new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, - new, NULL, XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 1, new); - ip->i_df.if_lastex = idx + 1; - ip->i_d.di_nextents++; - if (cur == NULL) + xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; - ASSERT(i == 0); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, 1, + &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - STARTBLOCKVAL(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); - *dnew = temp; - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; + da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp), + startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock(da_new)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + bma->idx++; break; case 0: @@ -1166,141 +2158,153 @@ xfs_bmap_add_extent_delay_real( * Filling in the middle part of a previous delayed allocation. * Contiguity is impossible here. * This case is avoided almost all the time. + * + * We start with a delayed allocation: + * + * +ddddddddddddddddddddddddddddddddddddddddddddddddddddddd+ + * PREV @ idx + * + * and we are allocating: + * +rrrrrrrrrrrrrrrrr+ + * new + * + * and we set it up for insertion as: + * +ddddddddddddddddddd+rrrrrrrrrrrrrrrrr+ddddddddddddddddd+ + * new + * PREV @ idx LEFT RIGHT + * inserted at idx + 1 */ temp = new->br_startoff - PREV.br_startoff; - xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); - xfs_bmbt_set_blockcount(ep, temp); - r[0] = *new; - r[1].br_startoff = new_endoff; temp2 = PREV.br_startoff + PREV.br_blockcount - new_endoff; - r[1].br_blockcount = temp2; - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 2, &r[0]); - ip->i_df.if_lastex = idx + 1; - ip->i_d.di_nextents++; - if (cur == NULL) + trace_xfs_bmap_pre_update(bma->ip, bma->idx, 0, _THIS_IP_); + xfs_bmbt_set_blockcount(ep, temp); /* truncate PREV */ + LEFT = *new; + RIGHT.br_state = PREV.br_state; + RIGHT.br_startblock = nullstartblock( + (int)xfs_bmap_worst_indlen(bma->ip, temp2)); + RIGHT.br_startoff = new_endoff; + RIGHT.br_blockcount = temp2; + /* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */ + xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state); + bma->ip->i_d.di_nextents++; + if (bma->cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount, - &i))) + &i); + if (error) goto done; - ASSERT(i == 0); - cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_nextents > ip->i_df.if_ext_max) { - error = xfs_bmap_extents_to_btree(ip->i_transp, ip, - first, flist, &cur, 1, &tmp_rval, - XFS_DATA_FORK); + + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 1, &tmp_rval, XFS_DATA_FORK); rval |= tmp_rval; if (error) goto done; } - temp = xfs_bmap_worst_indlen(ip, temp); - temp2 = xfs_bmap_worst_indlen(ip, temp2); - diff = (int)(temp + temp2 - STARTBLOCKVAL(PREV.br_startblock) - - (cur ? cur->bc_private.b.allocated : 0)); - if (diff > 0 && - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, -diff, rsvd)) { - /* - * Ick gross gag me with a spoon. - */ - ASSERT(0); /* want to see if this ever happens! */ - while (diff > 0) { - if (temp) { - temp--; - diff--; - if (!diff || - !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -diff, rsvd)) - break; - } - if (temp2) { - temp2--; - diff--; - if (!diff || - !xfs_mod_incore_sb(ip->i_mount, - XFS_SBS_FDBLOCKS, -diff, rsvd)) - break; - } - } + temp = xfs_bmap_worst_indlen(bma->ip, temp); + temp2 = xfs_bmap_worst_indlen(bma->ip, temp2); + diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - + (bma->cur ? bma->cur->bc_private.b.allocated : 0)); + if (diff > 0) { + error = xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + -((int64_t)diff), 0); + ASSERT(!error); + if (error) + goto done; } - ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "0", ip, idx + 2, - XFS_DATA_FORK); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx + 2), - NULLSTARTBLOCK((int)temp2)); - xfs_bmap_trace_post_update(fname, "0", ip, idx + 2, - XFS_DATA_FORK); - *dnew = temp + temp2; - /* DELTA: One in-core extent is split in three. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; + + ep = xfs_iext_get_ext(ifp, bma->idx); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_pre_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, bma->idx + 2), + nullstartblock((int)temp2)); + trace_xfs_bmap_post_update(bma->ip, bma->idx + 2, state, _THIS_IP_); + + bma->idx++; + da_new = temp + temp2; break; - case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK2(LEFT_FILLING, RIGHT_CONTIG): - case MASK2(RIGHT_FILLING, LEFT_CONTIG): - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): - case MASK(LEFT_CONTIG): - case MASK(RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: /* * These cases are all impossible. */ ASSERT(0); } - *curp = cur; - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + da_old > 0, &tmp_logflags, XFS_DATA_FORK); + bma->logflags |= tmp_logflags; + if (error) + goto done; } + + /* adjust for changes in reserved delayed indirect blocks */ + if (da_old || da_new) { + temp = da_new; + if (bma->cur) + temp += bma->cur->bc_private.b.allocated; + ASSERT(temp <= da_old); + if (temp < da_old) + xfs_icsb_modify_counters(bma->ip->i_mount, + XFS_SBS_FDBLOCKS, + (int64_t)(da_old - temp), 0); + } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); done: - *logflagsp = rval; + bma->logflags |= rval; return error; #undef LEFT #undef RIGHT #undef PREV -#undef MASK -#undef MASK2 -#undef MASK3 -#undef MASK4 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* - * Called by xfs_bmap_add_extent to handle cases converting an unwritten - * allocation to a real allocation or vice versa. + * Convert an unwritten allocation to a real allocation or vice versa. */ STATIC int /* error */ xfs_bmap_add_extent_unwritten_real( + struct xfs_trans *tp, xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ + xfs_extnum_t *idx, /* extent number to update/insert */ xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta) /* Change made to incore extents */ + xfs_fsblock_t *first, /* pointer to firstblock variable */ + xfs_bmap_free_t *flist, /* list of extents to be freed */ + int *logflagsp) /* inode logging flags */ { xfs_btree_cur_t *cur; /* btree cursor */ - xfs_bmbt_rec_t *ep; /* extent entry for idx */ + xfs_bmbt_rec_host_t *ep; /* extent entry for idx */ int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_unwritten_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_fileoff_t new_endoff; /* end offset of new entry */ @@ -1310,36 +2314,27 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ - xfs_filblks_t temp=0; - xfs_filblks_t temp2=0; - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_FILLING, RIGHT_FILLING, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; + + *logflagsp = 0; + + cur = *curp; + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + ASSERT(*idx >= 0); + ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + + XFS_STATS_INC(xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] #define PREV r[2] -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define MASK3(a,b,c) (MASK2(a,b) | MASK(c)) -#define MASK4(a,b,c,d) (MASK3(a,b,c) | MASK(d)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE \ - (state & MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG)) /* * Set up a bunch of variables to make the tests simpler. */ error = 0; - cur = *curp; - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &PREV); newext = new->br_state; oldext = (newext == XFS_EXT_UNWRITTEN) ? @@ -1348,70 +2343,80 @@ xfs_bmap_add_extent_unwritten_real( new_endoff = new->br_startoff + new->br_blockcount; ASSERT(PREV.br_startoff <= new->br_startoff); ASSERT(PREV.br_startoff + PREV.br_blockcount >= new_endoff); + /* * Set flags determining what part of the previous oldext allocation * extent is being replaced by a newext allocation. */ - STATE_SET(LEFT_FILLING, PREV.br_startoff == new->br_startoff); - STATE_SET(RIGHT_FILLING, - PREV.br_startoff + PREV.br_blockcount == new_endoff); + if (PREV.br_startoff == new->br_startoff) + state |= BMAP_LEFT_FILLING; + if (PREV.br_startoff + PREV.br_blockcount == new_endoff) + state |= BMAP_RIGHT_FILLING; + /* * Check and set flags if this segment has a left neighbor. * Don't set contiguous if the combined extent would be too large. */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &LEFT); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(LEFT.br_startblock)); + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &LEFT); + + if (isnullstartblock(LEFT.br_startblock)) + state |= BMAP_LEFT_DELAY; } - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && - LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && - LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && - LEFT.br_state == newext && - LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN); + + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff && + LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock && + LEFT.br_state == newext && + LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + /* * Check and set flags if this segment has a right neighbor. * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx + 1), &RIGHT); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(RIGHT.br_startblock)); + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); + if (isnullstartblock(RIGHT.br_startblock)) + state |= BMAP_RIGHT_DELAY; } - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && - new_endoff == RIGHT.br_startoff && - new->br_startblock + new->br_blockcount == - RIGHT.br_startblock && - newext == RIGHT.br_state && - new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && - ((state & MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING)) != - MASK3(LEFT_CONTIG, LEFT_FILLING, RIGHT_FILLING) || - LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount - <= MAXEXTLEN)); + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new_endoff == RIGHT.br_startoff && + new->br_startblock + new->br_blockcount == RIGHT.br_startblock && + newext == RIGHT.br_state && + new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN && + ((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING)) != + (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING | + BMAP_RIGHT_FILLING) || + LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount + <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; + /* * Switch out based on the FILLING and CONTIG state bits. */ - switch (SWITCH_STATE) { - - case MASK4(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | + BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The left and right neighbors are both contiguous with new. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LF|RF|LC|RC", ip, idx, 2, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 2); - ip->i_df.if_lastex = idx - 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 2, state); ip->i_d.di_nextents -= 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1421,47 +2426,40 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_delete(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_delete(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + RIGHT.br_blockcount, LEFT.br_state))) goto done; } - /* DELTA: Three in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount + - RIGHT.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The left neighbor is contiguous, the right is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), LEFT.br_blockcount + PREV.br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|RF|LC", ip, idx - 1, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx - 1; - xfs_bmap_trace_delete(fname, "LF|RF|LC", ip, idx, 1, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 1); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1471,41 +2469,32 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_delete(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, LEFT.br_state))) goto done; } - /* DELTA: Two in-core extents are replaced by one. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting all of a previous oldext extent to newext. * The right neighbor is contiguous, the left is not. */ - xfs_bmap_trace_pre_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount + RIGHT.br_blockcount); xfs_bmbt_set_state(ep, newext); - xfs_bmap_trace_post_update(fname, "LF|RF|RC", ip, idx, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx; - xfs_bmap_trace_delete(fname, "LF|RF|RC", ip, idx + 1, 1, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx + 1, 1); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_remove(ip, *idx + 1, 1, state); ip->i_d.di_nextents--; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1515,37 +2504,31 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_delete(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_delete(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext))) goto done; } - /* DELTA: Two in-core extents are replaced by one. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; - case MASK2(LEFT_FILLING, RIGHT_FILLING): + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: /* * Setting all of a previous oldext extent to newext. * Neither the left nor right neighbors are contiguous with * the new one. */ - xfs_bmap_trace_pre_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_state(ep, newext); - xfs_bmap_trace_post_update(fname, "LF|RF", ip, idx, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1554,39 +2537,35 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) goto done; } - /* DELTA: The in-core extent described by new changed type. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; - case MASK2(LEFT_FILLING, LEFT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: /* * Setting the first part of a previous oldext extent to newext. * The left neighbor is contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + trace_xfs_bmap_pre_update(ip, *idx - 1, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx - 1), LEFT.br_blockcount + new->br_blockcount); xfs_bmbt_set_startoff(ep, PREV.br_startoff + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, *idx - 1, state, _THIS_IP_); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF|LC", ip, idx, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx - 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + --*idx; + if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1595,44 +2574,39 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, PREV.br_blockcount - new->br_blockcount, oldext))) goto done; - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - if (xfs_bmbt_update(cur, LEFT.br_startoff, + error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + new->br_blockcount, - LEFT.br_state)) + LEFT.br_state); + if (error) goto done; } - /* DELTA: The boundary between two in-core extents moved. */ - temp = LEFT.br_startoff; - temp2 = LEFT.br_blockcount + - PREV.br_blockcount; break; - case MASK(LEFT_FILLING): + case BMAP_LEFT_FILLING: /* * Setting the first part of a previous oldext extent to newext. * The left neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "LF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); ASSERT(ep && xfs_bmbt_get_state(ep) == oldext); xfs_bmbt_set_startoff(ep, new_endoff); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); xfs_bmbt_set_startblock(ep, new->br_startblock + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LF", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_insert(fname, "LF", ip, idx, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx, 1, new); - ip->i_df.if_lastex = idx; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_insert(ip, *idx, 1, new, state); ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1642,7 +2616,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -1650,34 +2624,30 @@ xfs_bmap_add_extent_unwritten_real( oldext))) goto done; cur->bc_rec.b = *new; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; - case MASK2(RIGHT_FILLING, RIGHT_CONTIG): + case BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: /* * Setting the last part of a previous oldext extent to newext. * The right neighbor is contiguous with the new allocation. */ - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); - xfs_bmap_trace_pre_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx, - XFS_DATA_FORK); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, idx + 1), + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, newext); - xfs_bmap_trace_post_update(fname, "RF|RC", ip, idx + 1, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx + 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + if (cur == NULL) rval = XFS_ILOG_DEXT; else { @@ -1686,13 +2656,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, oldext))) goto done; - if ((error = xfs_bmbt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto done; if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, @@ -1700,25 +2670,21 @@ xfs_bmap_add_extent_unwritten_real( newext))) goto done; } - /* DELTA: The boundary between two in-core extents moved. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount + - RIGHT.br_blockcount; break; - case MASK(RIGHT_FILLING): + case BMAP_RIGHT_FILLING: /* * Setting the last part of a previous oldext extent to newext. * The right neighbor is not contiguous. */ - xfs_bmap_trace_pre_update(fname, "RF", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, PREV.br_blockcount - new->br_blockcount); - xfs_bmap_trace_post_update(fname, "RF", ip, idx, XFS_DATA_FORK); - xfs_bmap_trace_insert(fname, "RF", ip, idx + 1, 1, - new, NULL, XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 1, new); - ip->i_df.if_lastex = idx + 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + ++*idx; + xfs_iext_insert(ip, *idx, 1, new, state); + ip->i_d.di_nextents++; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1728,7 +2694,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -1738,15 +2704,12 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - ASSERT(i == 0); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; - if ((error = xfs_bmbt_insert(cur, &i))) + if ((error = xfs_btree_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in two. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; case 0: @@ -1755,20 +2718,21 @@ xfs_bmap_add_extent_unwritten_real( * newext. Contiguity is impossible here. * One extent becomes three extents. */ - xfs_bmap_trace_pre_update(fname, "0", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, new->br_startoff - PREV.br_startoff); - xfs_bmap_trace_post_update(fname, "0", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + r[0] = *new; r[1].br_startoff = new_endoff; r[1].br_blockcount = PREV.br_startoff + PREV.br_blockcount - new_endoff; r[1].br_startblock = new->br_startblock + new->br_blockcount; r[1].br_state = oldext; - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 2, &r[0], &r[1], - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx + 1, 2, &r[0]); - ip->i_df.if_lastex = idx + 1; + + ++*idx; + xfs_iext_insert(ip, *idx, 2, &r[0], state); + ip->i_d.di_nextents += 2; if (cur == NULL) rval = XFS_ILOG_CORE | XFS_ILOG_DEXT; @@ -1778,88 +2742,86 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, r[1].br_state))) goto done; /* new left extent - oldext */ - PREV.br_blockcount = - new->br_startoff - PREV.br_startoff; cur->bc_rec.b = PREV; - if ((error = xfs_bmbt_insert(cur, &i))) + cur->bc_rec.b.br_blockcount = + new->br_startoff - PREV.br_startoff; + if ((error = xfs_btree_insert(cur, &i))) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_increment(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + /* + * Reset the cursor to the position of the new extent + * we are about to insert as we can't trust it after + * the previous insert. + */ + if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff, + new->br_startblock, new->br_blockcount, + &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 0, done); /* new middle extent - newext */ - cur->bc_rec.b = *new; - if ((error = xfs_bmbt_insert(cur, &i))) + cur->bc_rec.b.br_state = new->br_state; + if ((error = xfs_btree_insert(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: One in-core extent is split in three. */ - temp = PREV.br_startoff; - temp2 = PREV.br_blockcount; break; - case MASK3(LEFT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK3(RIGHT_FILLING, LEFT_CONTIG, RIGHT_CONTIG): - case MASK2(LEFT_FILLING, RIGHT_CONTIG): - case MASK2(RIGHT_FILLING, LEFT_CONTIG): - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): - case MASK(LEFT_CONTIG): - case MASK(RIGHT_CONTIG): + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_FILLING | BMAP_RIGHT_CONTIG: + case BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + case BMAP_LEFT_CONTIG: + case BMAP_RIGHT_CONTIG: /* * These cases are all impossible. */ ASSERT(0); } - *curp = cur; - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, flist, &cur, + 0, &tmp_logflags, XFS_DATA_FORK); + *logflagsp |= tmp_logflags; + if (error) + goto done; + } + + /* clear out the allocated field, done with it now in any case. */ + if (cur) { + cur->bc_private.b.allocated = 0; + *curp = cur; } + + xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK); done: - *logflagsp = rval; + *logflagsp |= rval; return error; #undef LEFT #undef RIGHT #undef PREV -#undef MASK -#undef MASK2 -#undef MASK3 -#undef MASK4 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a delayed allocation. + * Convert a hole to a delayed allocation. */ -/*ARGSUSED*/ -STATIC int /* error */ +STATIC void xfs_bmap_add_extent_hole_delay( xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int rsvd) /* OK to allocate reserved blocks */ + xfs_extnum_t *idx, /* extent number to update/insert */ + xfs_bmbt_irec_t *new) /* new data to add to file extents */ { - xfs_bmbt_rec_t *ep; /* extent record for idx */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_hole_delay"; -#endif xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_filblks_t newlen=0; /* new indirect size */ @@ -1867,131 +2829,112 @@ xfs_bmap_add_extent_hole_delay( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int state; /* state bits, accessed thru macros */ xfs_filblks_t temp=0; /* temp for indirect calculations */ - xfs_filblks_t temp2=0; - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; - -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - ep = xfs_iext_get_ext(ifp, idx); state = 0; - ASSERT(ISNULLSTARTBLOCK(new->br_startblock)); + ASSERT(isnullstartblock(new->br_startblock)); + /* * Check and set flags if this segment has a left neighbor */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + if (*idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); + + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; } + /* * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); + + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; } + /* * Set contiguity flags on the left and right neighbors. * Don't let extents get too large, even if the pieces are contiguous. */ - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && STATE_TEST(LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN); - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && STATE_TEST(RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && - (!STATE_TEST(LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN))); + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN))) + state |= BMAP_RIGHT_CONTIG; + /* * Switch out based on the contiguity flags. */ - switch (SWITCH_STATE) { - - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with delayed allocations * on the left and on the right. * Merge all three into a single extent record. */ + --*idx; temp = left.br_blockcount + new->br_blockcount + right.br_blockcount; - xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); - oldlen = STARTBLOCKVAL(left.br_startblock) + - STARTBLOCKVAL(new->br_startblock) + - STARTBLOCKVAL(right.br_startblock); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), - NULLSTARTBLOCK((int)newlen)); - xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmap_trace_delete(fname, "LC|RC", ip, idx, 1, - XFS_DATA_FORK); - xfs_iext_remove(ifp, idx, 1); - ip->i_df.if_lastex = idx - 1; - /* DELTA: Two in-core extents were replaced by one. */ - temp2 = temp; - temp = left.br_startoff; + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + + xfs_iext_remove(ip, *idx + 1, 1, state); break; - case MASK(LEFT_CONTIG): + case BMAP_LEFT_CONTIG: /* * New allocation is contiguous with a delayed allocation * on the left. * Merge the new allocation with the left neighbor. */ + --*idx; temp = left.br_blockcount + new->br_blockcount; - xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, - XFS_DATA_FORK); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), temp); - oldlen = STARTBLOCKVAL(left.br_startblock) + - STARTBLOCKVAL(new->br_startblock); + + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp); + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, idx - 1), - NULLSTARTBLOCK((int)newlen)); - xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, - XFS_DATA_FORK); - ip->i_df.if_lastex = idx - 1; - /* DELTA: One in-core extent grew into a hole. */ - temp2 = temp; - temp = left.br_startoff; + xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx), + nullstartblock((int)newlen)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; - case MASK(RIGHT_CONTIG): + case BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with a delayed allocation * on the right. * Merge the new allocation with the right neighbor. */ - xfs_bmap_trace_pre_update(fname, "RC", ip, idx, XFS_DATA_FORK); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); temp = new->br_blockcount + right.br_blockcount; - oldlen = STARTBLOCKVAL(new->br_startblock) + - STARTBLOCKVAL(right.br_startblock); + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); newlen = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_allf(ep, new->br_startoff, - NULLSTARTBLOCK((int)newlen), temp, right.br_state); - xfs_bmap_trace_post_update(fname, "RC", ip, idx, XFS_DATA_FORK); - ip->i_df.if_lastex = idx; - /* DELTA: One in-core extent grew into a hole. */ - temp2 = temp; - temp = new->br_startoff; + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), + new->br_startoff, + nullstartblock((int)newlen), temp, right.br_state); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -2001,246 +2944,207 @@ xfs_bmap_add_extent_hole_delay( * Insert a new entry. */ oldlen = newlen = 0; - xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, - XFS_DATA_FORK); - xfs_iext_insert(ifp, idx, 1, new); - ip->i_df.if_lastex = idx; - /* DELTA: A new in-core extent was added in a hole. */ - temp2 = new->br_blockcount; - temp = new->br_startoff; + xfs_iext_insert(ip, *idx, 1, new, state); break; } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_mod_incore_sb(ip->i_mount, XFS_SBS_FDBLOCKS, - (int)(oldlen - newlen), rsvd); + xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, + (int64_t)(oldlen - newlen), 0); /* * Nothing to do for disk quota accounting here. */ } - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; - } - *logflagsp = 0; - return 0; -#undef MASK -#undef MASK2 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* - * Called by xfs_bmap_add_extent to handle cases converting a hole - * to a real allocation. + * Convert a hole to a real allocation. */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* extent number to update/insert */ - xfs_btree_cur_t *cur, /* if null, not a btree */ - xfs_bmbt_irec_t *new, /* new data to add to file extents */ - int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int whichfork) /* data or attr fork */ + struct xfs_bmalloca *bma, + int whichfork) { - xfs_bmbt_rec_t *ep; /* pointer to extent entry ins. point */ + struct xfs_bmbt_irec *new = &bma->got; int error; /* error return value */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_add_extent_hole_real"; -#endif int i; /* temp state */ xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - xfs_filblks_t temp=0; - xfs_filblks_t temp2=0; - enum { /* bit number definitions for state */ - LEFT_CONTIG, RIGHT_CONTIG, - LEFT_DELAY, RIGHT_DELAY, - LEFT_VALID, RIGHT_VALID - }; - -#define MASK(b) (1 << (b)) -#define MASK2(a,b) (MASK(a) | MASK(b)) -#define STATE_SET(b,v) ((v) ? (state |= MASK(b)) : (state &= ~MASK(b))) -#define STATE_TEST(b) (state & MASK(b)) -#define STATE_SET_TEST(b,v) ((v) ? ((state |= MASK(b)), 1) : \ - ((state &= ~MASK(b)), 0)) -#define SWITCH_STATE (state & MASK2(LEFT_CONTIG, RIGHT_CONTIG)) - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(idx <= ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - ep = xfs_iext_get_ext(ifp, idx); + ifp = XFS_IFORK_PTR(bma->ip, whichfork); + + ASSERT(bma->idx >= 0); + ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(!isnullstartblock(new->br_startblock)); + ASSERT(!bma->cur || + !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + + XFS_STATS_INC(xs_add_exlist); + state = 0; + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + /* * Check and set flags if this segment has a left neighbor. */ - if (STATE_SET_TEST(LEFT_VALID, idx > 0)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx - 1), &left); - STATE_SET(LEFT_DELAY, ISNULLSTARTBLOCK(left.br_startblock)); + if (bma->idx > 0) { + state |= BMAP_LEFT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; } + /* * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (STATE_SET_TEST(RIGHT_VALID, - idx < - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, &right); - STATE_SET(RIGHT_DELAY, ISNULLSTARTBLOCK(right.br_startblock)); + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + state |= BMAP_RIGHT_VALID; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; } + /* * We're inserting a real allocation between "left" and "right". * Set the contiguity flags. Don't let extents get too large. */ - STATE_SET(LEFT_CONTIG, - STATE_TEST(LEFT_VALID) && !STATE_TEST(LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_startblock + left.br_blockcount == new->br_startblock && - left.br_state == new->br_state && - left.br_blockcount + new->br_blockcount <= MAXEXTLEN); - STATE_SET(RIGHT_CONTIG, - STATE_TEST(RIGHT_VALID) && !STATE_TEST(RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_startblock + new->br_blockcount == - right.br_startblock && - new->br_state == right.br_state && - new->br_blockcount + right.br_blockcount <= MAXEXTLEN && - (!STATE_TEST(LEFT_CONTIG) || - left.br_blockcount + new->br_blockcount + - right.br_blockcount <= MAXEXTLEN)); + if ((state & BMAP_LEFT_VALID) && !(state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_startblock + left.br_blockcount == new->br_startblock && + left.br_state == new->br_state && + left.br_blockcount + new->br_blockcount <= MAXEXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_startblock + new->br_blockcount == right.br_startblock && + new->br_state == right.br_state && + new->br_blockcount + right.br_blockcount <= MAXEXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + left.br_blockcount + new->br_blockcount + + right.br_blockcount <= MAXEXTLEN)) + state |= BMAP_RIGHT_CONTIG; error = 0; /* * Select which case we're in here, and implement it. */ - switch (SWITCH_STATE) { - - case MASK2(LEFT_CONTIG, RIGHT_CONTIG): + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with real allocations on the * left and on the right. * Merge all three into a single extent record. */ - xfs_bmap_trace_pre_update(fname, "LC|RC", ip, idx - 1, - whichfork); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - xfs_bmap_trace_post_update(fname, "LC|RC", ip, idx - 1, - whichfork); - xfs_bmap_trace_delete(fname, "LC|RC", ip, - idx, 1, whichfork); - xfs_iext_remove(ifp, idx, 1); - ifp->if_lastex = idx - 1; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - if (cur == NULL) { - rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); + if (bma->cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, - right.br_startoff, - right.br_startblock, - right.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + right.br_startblock, right.br_blockcount, + &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_delete(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_btree_delete(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_decrement(cur, 0, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_btree_decrement(bma->cur, 0, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + right.br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } - /* DELTA: Two in-core extents were replaced by one. */ - temp = left.br_startoff; - temp2 = left.br_blockcount + - new->br_blockcount + - right.br_blockcount; break; - case MASK(LEFT_CONTIG): + case BMAP_LEFT_CONTIG: /* * New allocation is contiguous with a real allocation * on the left. * Merge the new allocation with the left neighbor. */ - xfs_bmap_trace_pre_update(fname, "LC", ip, idx - 1, whichfork); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, idx - 1), + --bma->idx; + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), left.br_blockcount + new->br_blockcount); - xfs_bmap_trace_post_update(fname, "LC", ip, idx - 1, whichfork); - ifp->if_lastex = idx - 1; - if (cur == NULL) { - rval = XFS_ILOG_FEXT(whichfork); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + if (bma->cur == NULL) { + rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, - left.br_startoff, - left.br_startblock, - left.br_blockcount, &i))) + error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + left.br_startblock, left.br_blockcount, + &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, left.br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, - left.br_state))) + left.br_state); + if (error) goto done; } - /* DELTA: One in-core extent grew. */ - temp = left.br_startoff; - temp2 = left.br_blockcount + - new->br_blockcount; break; - case MASK(RIGHT_CONTIG): + case BMAP_RIGHT_CONTIG: /* * New allocation is contiguous with a real allocation * on the right. * Merge the new allocation with the right neighbor. */ - xfs_bmap_trace_pre_update(fname, "RC", ip, idx, whichfork); - xfs_bmbt_set_allf(ep, new->br_startoff, new->br_startblock, + trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), + new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - xfs_bmap_trace_post_update(fname, "RC", ip, idx, whichfork); - ifp->if_lastex = idx; - if (cur == NULL) { - rval = XFS_ILOG_FEXT(whichfork); + trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + + if (bma->cur == NULL) { + rval = xfs_ilog_fext(whichfork); } else { rval = 0; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, right.br_startblock, - right.br_blockcount, &i))) + right.br_blockcount, &i); + if (error) goto done; - ASSERT(i == 1); - if ((error = xfs_bmbt_update(cur, new->br_startoff, + XFS_WANT_CORRUPTED_GOTO(i == 1, done); + error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, - right.br_state))) + right.br_state); + if (error) goto done; } - /* DELTA: One in-core extent grew. */ - temp = new->br_startoff; - temp2 = new->br_blockcount + - right.br_blockcount; break; case 0: @@ -2249,54 +3153,60 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_bmap_trace_insert(fname, "0", ip, idx, 1, new, NULL, - whichfork); - xfs_iext_insert(ifp, idx, 1, new); - ifp->if_lastex = idx; - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) + 1); - if (cur == NULL) { - rval = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + xfs_iext_insert(bma->ip, bma->idx, 1, new, state); + XFS_IFORK_NEXT_SET(bma->ip, whichfork, + XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); + if (bma->cur == NULL) { + rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - if ((error = xfs_bmbt_lookup_eq(cur, + error = xfs_bmbt_lookup_eq(bma->cur, new->br_startoff, new->br_startblock, - new->br_blockcount, &i))) + new->br_blockcount, &i); + if (error) goto done; - ASSERT(i == 0); - cur->bc_rec.b.br_state = new->br_state; - if ((error = xfs_bmbt_insert(cur, &i))) + XFS_WANT_CORRUPTED_GOTO(i == 0, done); + bma->cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(bma->cur, &i); + if (error) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } - /* DELTA: A new extent was added in a hole. */ - temp = new->br_startoff; - temp2 = new->br_blockcount; break; } - if (delta) { - temp2 += temp; - if (delta->xed_startoff > temp) - delta->xed_startoff = temp; - if (delta->xed_blockcount < temp2) - delta->xed_blockcount = temp2; + + /* convert to a btree if necessary */ + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(bma->cur == NULL); + error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, + bma->firstblock, bma->flist, &bma->cur, + 0, &tmp_logflags, whichfork); + bma->logflags |= tmp_logflags; + if (error) + goto done; } + + /* clear out the allocated field, done with it now in any case. */ + if (bma->cur) + bma->cur->bc_private.b.allocated = 0; + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: - *logflagsp = rval; + bma->logflags |= rval; return error; -#undef MASK -#undef MASK2 -#undef STATE_SET -#undef STATE_TEST -#undef STATE_SET_TEST -#undef SWITCH_STATE } /* + * Functions used in the extent read, allocate and remove paths + */ + +/* * Adjust the size of the new extent based on di_extsize and rt extsize. */ -STATIC int +int xfs_bmap_extsize_align( xfs_mount_t *mp, xfs_bmbt_irec_t *gotp, /* next extent pointer */ @@ -2458,9 +3368,9 @@ xfs_bmap_extsize_align( #define XFS_ALLOC_GAP_UNITS 4 -STATIC int +void xfs_bmap_adjacent( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_fsblock_t adjust; /* adjustment to block numbers */ xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ @@ -2476,26 +3386,26 @@ xfs_bmap_adjacent( XFS_FSB_TO_AGBNO(mp, x) < mp->m_sb.sb_agblocks) mp = ap->ip->i_mount; - nullfb = ap->firstblock == NULLFSBLOCK; + nullfb = *ap->firstblock == NULLFSBLOCK; rt = XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); /* * If allocating at eof, and there's a previous real block, - * try to use it's last block as our starting point. + * try to use its last block as our starting point. */ - if (ap->eof && ap->prevp->br_startoff != NULLFILEOFF && - !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && - ISVALID(ap->prevp->br_startblock + ap->prevp->br_blockcount, - ap->prevp->br_startblock)) { - ap->rval = ap->prevp->br_startblock + ap->prevp->br_blockcount; + if (ap->eof && ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + ISVALID(ap->prev.br_startblock + ap->prev.br_blockcount, + ap->prev.br_startblock)) { + ap->blkno = ap->prev.br_startblock + ap->prev.br_blockcount; /* * Adjust for the gap between prevp and us. */ - adjust = ap->off - - (ap->prevp->br_startoff + ap->prevp->br_blockcount); + adjust = ap->offset - + (ap->prev.br_startoff + ap->prev.br_blockcount); if (adjust && - ISVALID(ap->rval + adjust, ap->prevp->br_startblock)) - ap->rval += adjust; + ISVALID(ap->blkno + adjust, ap->prev.br_startblock)) + ap->blkno += adjust; } /* * If not at eof, then compare the two neighbor blocks. @@ -2512,17 +3422,17 @@ xfs_bmap_adjacent( * If there's a previous (left) block, select a requested * start block based on it. */ - if (ap->prevp->br_startoff != NULLFILEOFF && - !ISNULLSTARTBLOCK(ap->prevp->br_startblock) && - (prevbno = ap->prevp->br_startblock + - ap->prevp->br_blockcount) && - ISVALID(prevbno, ap->prevp->br_startblock)) { + if (ap->prev.br_startoff != NULLFILEOFF && + !isnullstartblock(ap->prev.br_startblock) && + (prevbno = ap->prev.br_startblock + + ap->prev.br_blockcount) && + ISVALID(prevbno, ap->prev.br_startblock)) { /* * Calculate gap to end of previous block. */ - adjust = prevdiff = ap->off - - (ap->prevp->br_startoff + - ap->prevp->br_blockcount); + adjust = prevdiff = ap->offset - + (ap->prev.br_startoff + + ap->prev.br_blockcount); /* * Figure the startblock based on the previous block's * end and the gap size. @@ -2531,9 +3441,9 @@ xfs_bmap_adjacent( * allocating, or using it gives us an invalid block * number, then just use the end of the previous block. */ - if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (prevdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(prevbno + prevdiff, - ap->prevp->br_startblock)) + ap->prev.br_startblock)) prevbno += adjust; else prevdiff += adjust; @@ -2554,16 +3464,16 @@ xfs_bmap_adjacent( * If there's a following (right) block, select a requested * start block based on it. */ - if (!ISNULLSTARTBLOCK(ap->gotp->br_startblock)) { + if (!isnullstartblock(ap->got.br_startblock)) { /* * Calculate gap to start of next block. */ - adjust = gotdiff = ap->gotp->br_startoff - ap->off; + adjust = gotdiff = ap->got.br_startoff - ap->offset; /* * Figure the startblock based on the next block's * start and the gap size. */ - gotbno = ap->gotp->br_startblock; + gotbno = ap->got.br_startblock; /* * Heuristic! * If the gap is large relative to the piece we're @@ -2571,12 +3481,12 @@ xfs_bmap_adjacent( * number, then just use the start of the next block * offset by our length. */ - if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->alen && + if (gotdiff <= XFS_ALLOC_GAP_UNITS * ap->length && ISVALID(gotbno - gotdiff, gotbno)) gotbno -= adjust; - else if (ISVALID(gotbno - ap->alen, gotbno)) { - gotbno -= ap->alen; - gotdiff += adjust - ap->alen; + else if (ISVALID(gotbno - ap->length, gotbno)) { + gotbno -= ap->length; + gotdiff += adjust - ap->length; } else gotdiff += adjust; /* @@ -2594,252 +3504,266 @@ xfs_bmap_adjacent( gotbno = NULLFSBLOCK; /* * If both valid, pick the better one, else the only good - * one, else ap->rval is already set (to 0 or the inode block). + * one, else ap->blkno is already set (to 0 or the inode block). */ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) - ap->rval = prevdiff <= gotdiff ? prevbno : gotbno; + ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno; else if (prevbno != NULLFSBLOCK) - ap->rval = prevbno; + ap->blkno = prevbno; else if (gotbno != NULLFSBLOCK) - ap->rval = gotbno; + ap->blkno = gotbno; } #undef ISVALID +} + +static int +xfs_bmap_longest_free_extent( + struct xfs_trans *tp, + xfs_agnumber_t ag, + xfs_extlen_t *blen, + int *notinit) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + xfs_extlen_t longest; + int error = 0; + + pag = xfs_perag_get(mp, ag); + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK); + if (error) + goto out; + + if (!pag->pagf_init) { + *notinit = 1; + goto out; + } + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (*blen < longest) + *blen = longest; + +out: + xfs_perag_put(pag); + return error; +} + +static void +xfs_bmap_select_minlen( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen, + int notinit) +{ + if (notinit || *blen < ap->minlen) { + /* + * Since we did a BUF_TRYLOCK above, it is possible that + * there is space for this request. + */ + args->minlen = ap->minlen; + } else if (*blen < args->maxlen) { + /* + * If the best seen length is less than the request length, + * use the best as the minimum. + */ + args->minlen = *blen; + } else { + /* + * Otherwise we've seen an extent as big as maxlen, use that + * as the minimum. + */ + args->minlen = args->maxlen; + } +} + +STATIC int +xfs_bmap_btalloc_nullfb( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) +{ + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag, startag; + int notinit = 0; + int error; + + args->type = XFS_ALLOCTYPE_START_BNO; + args->total = ap->total; + + startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (startag == NULLAGNUMBER) + startag = ag = 0; + + while (*blen < args->maxlen) { + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + + if (++ag == mp->m_sb.sb_agcount) + ag = 0; + if (ag == startag) + break; + } + + xfs_bmap_select_minlen(ap, args, blen, notinit); return 0; } STATIC int -xfs_bmap_rtalloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ +xfs_bmap_btalloc_filestreams( + struct xfs_bmalloca *ap, + struct xfs_alloc_arg *args, + xfs_extlen_t *blen) { - xfs_alloctype_t atype = 0; /* type for allocation routines */ - int error; /* error return value */ - xfs_mount_t *mp; /* mount point structure */ - xfs_extlen_t prod = 0; /* product factor for allocators */ - xfs_extlen_t ralen = 0; /* realtime allocation length */ - xfs_extlen_t align; /* minimum allocation alignment */ - xfs_rtblock_t rtx; /* realtime extent number */ - xfs_rtblock_t rtb; + struct xfs_mount *mp = ap->ip->i_mount; + xfs_agnumber_t ag; + int notinit = 0; + int error; - mp = ap->ip->i_mount; - align = ap->ip->i_d.di_extsize ? - ap->ip->i_d.di_extsize : mp->m_sb.sb_rextsize; - prod = align / mp->m_sb.sb_rextsize; - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, - align, 1, ap->eof, 0, - ap->conv, &ap->off, &ap->alen); + args->type = XFS_ALLOCTYPE_NEAR_BNO; + args->total = ap->total; + + ag = XFS_FSB_TO_AGNO(mp, args->fsbno); + if (ag == NULLAGNUMBER) + ag = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, ¬init); if (error) return error; - ASSERT(ap->alen); - ASSERT(ap->alen % mp->m_sb.sb_rextsize == 0); - /* - * If the offset & length are not perfectly aligned - * then kill prod, it will just get us in trouble. - */ - if (do_mod(ap->off, align) || ap->alen % align) - prod = 1; - /* - * Set ralen to be the actual requested length in rtextents. - */ - ralen = ap->alen / mp->m_sb.sb_rextsize; - /* - * If the old value was close enough to MAXEXTLEN that - * we rounded up to it, cut it back so it's valid again. - * Note that if it's a really large request (bigger than - * MAXEXTLEN), we don't hear about that number, and can't - * adjust the starting point to match it. - */ - if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) - ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; - /* - * If it's an allocation to an empty file at offset 0, - * pick an extent that will space things out in the rt area. - */ - if (ap->eof && ap->off == 0) { - error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (*blen < args->maxlen) { + error = xfs_filestream_new_ag(ap, &ag); if (error) return error; - ap->rval = rtx * mp->m_sb.sb_rextsize; - } else { - ap->rval = 0; + + error = xfs_bmap_longest_free_extent(args->tp, ag, blen, + ¬init); + if (error) + return error; + } - xfs_bmap_adjacent(ap); + xfs_bmap_select_minlen(ap, args, blen, notinit); /* - * Realtime allocation, done through xfs_rtallocate_extent. + * Set the failure fallback case to look in the selected AG as stream + * may have moved. */ - atype = ap->rval == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; - do_div(ap->rval, mp->m_sb.sb_rextsize); - rtb = ap->rval; - ap->alen = ralen; - if ((error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, ap->alen, - &ralen, atype, ap->wasdel, prod, &rtb))) - return error; - if (rtb == NULLFSBLOCK && prod > 1 && - (error = xfs_rtallocate_extent(ap->tp, ap->rval, 1, - ap->alen, &ralen, atype, - ap->wasdel, 1, &rtb))) - return error; - ap->rval = rtb; - if (ap->rval != NULLFSBLOCK) { - ap->rval *= mp->m_sb.sb_rextsize; - ralen *= mp->m_sb.sb_rextsize; - ap->alen = ralen; - ap->ip->i_d.di_nblocks += ralen; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - if (ap->wasdel) - ap->ip->i_delayed_blks -= ralen; - /* - * Adjust the disk quota also. This was reserved - * earlier. - */ - XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip, - ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : - XFS_TRANS_DQ_RTBCOUNT, (long) ralen); - } else { - ap->alen = 0; - } + ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0); return 0; } STATIC int xfs_bmap_btalloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { xfs_mount_t *mp; /* mount point structure */ xfs_alloctype_t atype = 0; /* type for allocation routines */ xfs_extlen_t align; /* minimum allocation alignment */ - xfs_agnumber_t ag; xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ - xfs_agnumber_t startag; + xfs_agnumber_t ag; xfs_alloc_arg_t args; xfs_extlen_t blen; - xfs_extlen_t delta; - xfs_extlen_t longest; - xfs_extlen_t need; xfs_extlen_t nextminlen = 0; - xfs_perag_t *pag; int nullfb; /* true if ap->firstblock isn't set */ int isaligned; - int notinit; int tryagain; int error; + int stripe_align; + + ASSERT(ap->length); mp = ap->ip->i_mount; - align = (ap->userdata && ap->ip->i_d.di_extsize && - (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)) ? - ap->ip->i_d.di_extsize : 0; + + /* stripe alignment for allocation is determined by mount parameters */ + stripe_align = 0; + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + stripe_align = mp->m_swidth; + else if (mp->m_dalign) + stripe_align = mp->m_dalign; + + align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0; if (unlikely(align)) { - error = xfs_bmap_extsize_align(mp, ap->gotp, ap->prevp, + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, - &ap->off, &ap->alen); + &ap->offset, &ap->length); ASSERT(!error); - ASSERT(ap->alen); + ASSERT(ap->length); } - nullfb = ap->firstblock == NULLFSBLOCK; - fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, ap->firstblock); - if (nullfb) - ap->rval = XFS_INO_TO_FSB(mp, ap->ip->i_ino); - else - ap->rval = ap->firstblock; + + + nullfb = *ap->firstblock == NULLFSBLOCK; + fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock); + if (nullfb) { + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) { + ag = xfs_filestream_lookup_ag(ap->ip); + ag = (ag != NULLAGNUMBER) ? ag : 0; + ap->blkno = XFS_AGB_TO_FSB(mp, ag, 0); + } else { + ap->blkno = XFS_INO_TO_FSB(mp, ap->ip->i_ino); + } + } else + ap->blkno = *ap->firstblock; xfs_bmap_adjacent(ap); /* - * If allowed, use ap->rval; otherwise must use firstblock since + * If allowed, use ap->blkno; otherwise must use firstblock since * it's in the right allocation group. */ - if (nullfb || XFS_FSB_TO_AGNO(mp, ap->rval) == fb_agno) + if (nullfb || XFS_FSB_TO_AGNO(mp, ap->blkno) == fb_agno) ; else - ap->rval = ap->firstblock; + ap->blkno = *ap->firstblock; /* * Normal allocation, done through xfs_alloc_vextent. */ tryagain = isaligned = 0; + memset(&args, 0, sizeof(args)); args.tp = ap->tp; args.mp = mp; - args.fsbno = ap->rval; - args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks); - args.firstblock = ap->firstblock; + args.fsbno = ap->blkno; + + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp)); + args.firstblock = *ap->firstblock; blen = 0; if (nullfb) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.total = ap->total; - /* - * Find the longest available space. - * We're going to try for the whole allocation at once. - */ - startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno); - notinit = 0; - down_read(&mp->m_peraglock); - while (blen < ap->alen) { - pag = &mp->m_perag[ag]; - if (!pag->pagf_init && - (error = xfs_alloc_pagf_init(mp, args.tp, - ag, XFS_ALLOC_FLAG_TRYLOCK))) { - up_read(&mp->m_peraglock); - return error; - } - /* - * See xfs_alloc_fix_freelist... - */ - if (pag->pagf_init) { - need = XFS_MIN_FREELIST_PAG(pag, mp); - delta = need > pag->pagf_flcount ? - need - pag->pagf_flcount : 0; - longest = (pag->pagf_longest > delta) ? - (pag->pagf_longest - delta) : - (pag->pagf_flcount > 0 || - pag->pagf_longest > 0); - if (blen < longest) - blen = longest; - } else - notinit = 1; - if (++ag == mp->m_sb.sb_agcount) - ag = 0; - if (ag == startag) - break; - } - up_read(&mp->m_peraglock); - /* - * Since the above loop did a BUF_TRYLOCK, it is - * possible that there is space for this request. - */ - if (notinit || blen < ap->minlen) - args.minlen = ap->minlen; /* - * If the best seen length is less than the request - * length, use the best as the minimum. - */ - else if (blen < ap->alen) - args.minlen = blen; - /* - * Otherwise we've seen an extent as big as alen, - * use that as the minimum. + * Search for an allocation group with a single extent large + * enough for the request. If one isn't found, then adjust + * the minimum allocation size to the largest space found. */ + if (ap->userdata && xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, &blen); else - args.minlen = ap->alen; - } else if (ap->low) { - args.type = XFS_ALLOCTYPE_START_BNO; + error = xfs_bmap_btalloc_nullfb(ap, &args, &blen); + if (error) + return error; + } else if (ap->flist->xbf_low) { + if (xfs_inode_is_filestream(ap->ip)) + args.type = XFS_ALLOCTYPE_FIRST_AG; + else + args.type = XFS_ALLOCTYPE_START_BNO; args.total = args.minlen = ap->minlen; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.total = ap->total; args.minlen = ap->minlen; } - if (unlikely(ap->userdata && ap->ip->i_d.di_extsize && - (ap->ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE))) { - args.prod = ap->ip->i_d.di_extsize; - if ((args.mod = (xfs_extlen_t)do_mod(ap->off, args.prod))) + /* apply extent size hints if obtained earlier */ + if (unlikely(align)) { + args.prod = align; + if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); - } else if (mp->m_sb.sb_blocksize >= NBPP) { + } else if (mp->m_sb.sb_blocksize >= PAGE_CACHE_SIZE) { args.prod = 1; args.mod = 0; } else { - args.prod = NBPP >> mp->m_sb.sb_blocklog; - if ((args.mod = (xfs_extlen_t)(do_mod(ap->off, args.prod)))) + args.prod = PAGE_CACHE_SIZE >> mp->m_sb.sb_blocklog; + if ((args.mod = (xfs_extlen_t)(do_mod(ap->offset, args.prod)))) args.mod = (xfs_extlen_t)(args.prod - args.mod); } /* @@ -2851,15 +3775,15 @@ xfs_bmap_btalloc( * is >= the stripe unit and the allocation offset is * at the end of file. */ - if (!ap->low && ap->aeof) { - if (!ap->off) { - args.alignment = mp->m_dalign; + if (!ap->flist->xbf_low && ap->aeof) { + if (!ap->offset) { + args.alignment = stripe_align; atype = args.type; isaligned = 1; /* * Adjust for alignment */ - if (blen > args.alignment && blen <= ap->alen) + if (blen > args.alignment && blen <= args.maxlen) args.minlen = blen - args.alignment; args.minalignslop = 0; } else { @@ -2878,13 +3802,13 @@ xfs_bmap_btalloc( * of minlen+alignment+slop doesn't go up * between the calls. */ - if (blen > mp->m_dalign && blen <= ap->alen) - nextminlen = blen - mp->m_dalign; + if (blen > stripe_align && blen <= args.maxlen) + nextminlen = blen - stripe_align; else nextminlen = args.minlen; - if (nextminlen + mp->m_dalign > args.minlen + 1) + if (nextminlen + stripe_align > args.minlen + 1) args.minalignslop = - nextminlen + mp->m_dalign - + nextminlen + stripe_align - args.minlen - 1; else args.minalignslop = 0; @@ -2905,8 +3829,8 @@ xfs_bmap_btalloc( * turned on. */ args.type = atype; - args.fsbno = ap->rval; - args.alignment = mp->m_dalign; + args.fsbno = ap->blkno; + args.alignment = stripe_align; args.minlen = nextminlen; args.minalignslop = 0; isaligned = 1; @@ -2919,7 +3843,7 @@ xfs_bmap_btalloc( * try again. */ args.type = atype; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; args.alignment = 0; if ((error = xfs_alloc_vextent(&args))) return error; @@ -2928,7 +3852,7 @@ xfs_bmap_btalloc( args.minlen > ap->minlen) { args.minlen = ap->minlen; args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = ap->rval; + args.fsbno = ap->blkno; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -2939,13 +3863,26 @@ xfs_bmap_btalloc( args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; - ap->low = 1; + ap->flist->xbf_low = 1; } if (args.fsbno != NULLFSBLOCK) { - ap->firstblock = ap->rval = args.fsbno; + /* + * check the allocation happened at the same or higher AG than + * the first block that was allocated. + */ + ASSERT(*ap->firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *ap->firstblock) == + XFS_FSB_TO_AGNO(mp, args.fsbno) || + (ap->flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *ap->firstblock) < + XFS_FSB_TO_AGNO(mp, args.fsbno))); + + ap->blkno = args.fsbno; + if (*ap->firstblock == NULLFSBLOCK) + *ap->firstblock = args.fsbno; ASSERT(nullfb || fb_agno == args.agno || - (ap->low && fb_agno < args.agno)); - ap->alen = args.len; + (ap->flist->xbf_low && fb_agno < args.agno)); + ap->length = args.len; ap->ip->i_d.di_nblocks += args.len; xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); if (ap->wasdel) @@ -2954,13 +3891,13 @@ xfs_bmap_btalloc( * Adjust the disk quota also. This was reserved * earlier. */ - XFS_TRANS_MOD_DQUOT_BYINO(mp, ap->tp, ap->ip, + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT, (long) args.len); } else { - ap->rval = NULLFSBLOCK; - ap->alen = 0; + ap->blkno = NULLFSBLOCK; + ap->length = 0; } return 0; } @@ -2971,73 +3908,803 @@ xfs_bmap_btalloc( */ STATIC int xfs_bmap_alloc( - xfs_bmalloca_t *ap) /* bmap alloc argument struct */ + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if ((ap->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && ap->userdata) + if (XFS_IS_REALTIME_INODE(ap->ip) && ap->userdata) return xfs_bmap_rtalloc(ap); return xfs_bmap_btalloc(ap); } /* - * Transform a btree format file with only one leaf node, where the - * extents list will fit in the inode, into an extents format file. - * Since the file extents are already in-core, all we have to do is - * give up the space for the btree root and pitch the leaf block. + * Trim the returned map to the required bounds */ -STATIC int /* error */ -xfs_bmap_btree_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ +STATIC void +xfs_bmapi_trim_map( + struct xfs_bmbt_irec *mval, + struct xfs_bmbt_irec *got, + xfs_fileoff_t *bno, + xfs_filblks_t len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int n, + int flags) { - /* REFERENCED */ - xfs_bmbt_block_t *cblock;/* child btree block */ - xfs_fsblock_t cbno; /* child block number */ - xfs_buf_t *cbp; /* child block's buffer */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork data */ - xfs_mount_t *mp; /* mount point structure */ - xfs_bmbt_ptr_t *pp; /* ptr to block address */ - xfs_bmbt_block_t *rblock;/* root btree block */ + if ((flags & XFS_BMAPI_ENTIRE) || + got->br_startoff + got->br_blockcount <= obno) { + *mval = *got; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + return; + } + + if (obno > *bno) + *bno = obno; + ASSERT((*bno >= obno) || (n == 0)); + ASSERT(*bno < end); + mval->br_startoff = *bno; + if (isnullstartblock(got->br_startblock)) + mval->br_startblock = DELAYSTARTBLOCK; + else + mval->br_startblock = got->br_startblock + + (*bno - got->br_startoff); + /* + * Return the minimum of what we got and what we asked for for + * the length. We can use the len variable here because it is + * modified below and we could have been there before coming + * here if the first part of the allocation didn't overlap what + * was asked for. + */ + mval->br_blockcount = XFS_FILBLKS_MIN(end - *bno, + got->br_blockcount - (*bno - got->br_startoff)); + mval->br_state = got->br_state; + ASSERT(mval->br_blockcount <= len); + return; +} + +/* + * Update and validate the extent map to return + */ +STATIC void +xfs_bmapi_update_map( + struct xfs_bmbt_irec **map, + xfs_fileoff_t *bno, + xfs_filblks_t *len, + xfs_fileoff_t obno, + xfs_fileoff_t end, + int *n, + int flags) +{ + xfs_bmbt_irec_t *mval = *map; + + ASSERT((flags & XFS_BMAPI_ENTIRE) || + ((mval->br_startoff + mval->br_blockcount) <= end)); + ASSERT((flags & XFS_BMAPI_ENTIRE) || (mval->br_blockcount <= *len) || + (mval->br_startoff < obno)); + + *bno = mval->br_startoff + mval->br_blockcount; + *len = end - *bno; + if (*n > 0 && mval->br_startoff == mval[-1].br_startoff) { + /* update previous map with new information */ + ASSERT(mval->br_startblock == mval[-1].br_startblock); + ASSERT(mval->br_blockcount > mval[-1].br_blockcount); + ASSERT(mval->br_state == mval[-1].br_state); + mval[-1].br_blockcount = mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (*n > 0 && mval->br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != DELAYSTARTBLOCK && + mval[-1].br_startblock != HOLESTARTBLOCK && + mval->br_startblock == mval[-1].br_startblock + + mval[-1].br_blockcount && + ((flags & XFS_BMAPI_IGSTATE) || + mval[-1].br_state == mval->br_state)) { + ASSERT(mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount); + mval[-1].br_blockcount += mval->br_blockcount; + } else if (*n > 0 && + mval->br_startblock == DELAYSTARTBLOCK && + mval[-1].br_startblock == DELAYSTARTBLOCK && + mval->br_startoff == + mval[-1].br_startoff + mval[-1].br_blockcount) { + mval[-1].br_blockcount += mval->br_blockcount; + mval[-1].br_state = mval->br_state; + } else if (!((*n == 0) && + ((mval->br_startoff + mval->br_blockcount) <= + obno))) { + mval++; + (*n)++; + } + *map = mval; +} + +/* + * Map file blocks to filesystem blocks without allocation. + */ +int +xfs_bmapi_read( + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + struct xfs_bmbt_irec *mval, + int *nmap, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec prev; + xfs_fileoff_t obno; + xfs_fileoff_t end; + xfs_extnum_t lastx; + int error; + int eof; + int n = 0; + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE| + XFS_BMAPI_IGSTATE))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - rblock = ifp->if_broot; - ASSERT(be16_to_cpu(rblock->bb_level) == 1); - ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1); - ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1); - mp = ip->i_mount; - pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes); - *logflagsp = 0; -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), 1))) + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, whichfork); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + /* Reading past eof, act as though there's a hole up to end. */ + if (eof) + got.br_startoff = end; + if (got.br_startoff > bno) { + /* Reading in a hole. */ + mval->br_startoff = bno; + mval->br_startblock = HOLESTARTBLOCK; + mval->br_blockcount = + XFS_FILBLKS_MIN(len, got.br_startoff - bno); + mval->br_state = XFS_EXT_NORM; + bno += mval->br_blockcount; + len -= mval->br_blockcount; + mval++; + n++; + continue; + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + *nmap = n; + return 0; +} + +STATIC int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + xfs_fileoff_t aoff, + xfs_filblks_t len, + struct xfs_bmbt_irec *got, + struct xfs_bmbt_irec *prev, + xfs_extnum_t *lastx, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + xfs_extlen_t alen; + xfs_extlen_t indlen; + char rt = XFS_IS_REALTIME_INODE(ip); + xfs_extlen_t extsz; + int error; + + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + + /* Figure out the extent size, adjust alen */ + extsz = xfs_get_extsz_hint(ip); + if (extsz) { + /* + * Make sure we don't exceed a single extent length when we + * align the extent by reducing length we are going to + * allocate by the maximum amount extent size aligment may + * require. + */ + alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); + error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + if (rt) + extsz = alen / mp->m_sb.sb_rextsize; + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_trans_reserve_quota_nblks(NULL, ip, (long)alen, 0, + rt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) return error; -#endif - cbno = INT_GET(*pp, ARCH_CONVERT); - if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, - XFS_BMAP_BTREE_REF))) + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + if (rt) { + error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, + -((int64_t)extsz), 0); + } else { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)alen), 0); + } + + if (error) + goto out_unreserve_quota; + + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + -((int64_t)indlen), 0); + if (error) + goto out_unreserve_blocks; + + + ip->i_delayed_blks += alen; + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, lastx, got); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay + * might have merged it into one of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + + ASSERT(got->br_startoff <= aoff); + ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); + ASSERT(isnullstartblock(got->br_startblock)); + ASSERT(got->br_state == XFS_EXT_NORM); + return 0; + +out_unreserve_blocks: + if (rt) + xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + else + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + return error; +} + +/* + * Map file blocks to filesystem blocks, adding delayed allocations as needed. + */ +int +xfs_bmapi_delay( + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + int flags) /* XFS_BMAPI_... */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_bmbt_irec got; /* current file extent record */ + struct xfs_bmbt_irec prev; /* previous file extent record */ + xfs_fileoff_t obno; /* old block number (offset) */ + xfs_fileoff_t end; /* end of mapped file region */ + xfs_extnum_t lastx; /* last useful extent number */ + int eof; /* we've hit the end of extents */ + int n = 0; /* current extent index */ + int error = 0; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & ~XFS_BMAPI_ENTIRE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_blk_mapw); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + xfs_bmap_search_extents(ip, bno, XFS_DATA_FORK, &eof, &lastx, &got, &prev); + end = bno + len; + obno = bno; + + while (bno < end && n < *nmap) { + if (eof || got.br_startoff > bno) { + error = xfs_bmapi_reserve_delalloc(ip, bno, len, &got, + &prev, &lastx, eof); + if (error) { + if (n == 0) { + *nmap = 0; + return error; + } + break; + } + } + + /* set up the extent map to return. */ + xfs_bmapi_trim_map(mval, &got, &bno, len, obno, end, n, flags); + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* If we're done, stop now. */ + if (bno >= end || n >= *nmap) + break; + + /* Else go on to the next record. */ + prev = got; + if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); + else + eof = 1; + } + + *nmap = n; + return 0; +} + + +static int +xfs_bmapi_allocate( + struct xfs_bmalloca *bma) +{ + struct xfs_mount *mp = bma->ip->i_mount; + int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + ASSERT(bma->length > 0); + + /* + * For the wasdelay case, we could also just allocate the stuff asked + * for in this bmap call but that wouldn't be as good. + */ + if (bma->wasdel) { + bma->length = (xfs_extlen_t)bma->got.br_blockcount; + bma->offset = bma->got.br_startoff; + if (bma->idx != NULLEXTNUM && bma->idx) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), + &bma->prev); + } + } else { + bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN); + if (!bma->eof) + bma->length = XFS_FILBLKS_MIN(bma->length, + bma->got.br_startoff - bma->offset); + } + + /* + * Indicate if this is the first user data in the file, or just any + * user data. + */ + if (!(bma->flags & XFS_BMAPI_METADATA)) { + bma->userdata = (bma->offset == 0) ? + XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + } + + bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; + + /* + * Only want to do the alignment at the eof if it is userdata and + * allocation length is larger than a stripe unit. + */ + if (mp->m_dalign && bma->length >= mp->m_dalign && + !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) { + error = xfs_bmap_isaeof(bma, whichfork); + if (error) + return error; + } + + error = xfs_bmap_alloc(bma); + if (error) return error; - cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); - if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp))) + + if (bma->flist->xbf_low) + bma->minleft = 0; + if (bma->cur) + bma->cur->bc_private.b.firstblock = *bma->firstblock; + if (bma->blkno == NULLFSBLOCK) + return 0; + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(mp, bma->tp, bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + /* + * Bump the number of extents we've allocated + * in this call. + */ + bma->nallocs++; + + if (bma->cur) + bma->cur->bc_private.b.flags = + bma->wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; + + bma->got.br_startoff = bma->offset; + bma->got.br_startblock = bma->blkno; + bma->got.br_blockcount = bma->length; + bma->got.br_state = XFS_EXT_NORM; + + /* + * A wasdelay extent has been initialized, so shouldn't be flagged + * as unwritten. + */ + if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + bma->got.br_state = XFS_EXT_UNWRITTEN; + + if (bma->wasdel) + error = xfs_bmap_add_extent_delay_real(bma); + else + error = xfs_bmap_add_extent_hole_real(bma, whichfork); + + bma->logflags |= tmp_logflags; + if (error) return error; - xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); - ip->i_d.di_nblocks--; - XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; - xfs_iroot_realloc(ip, -1, whichfork); - ASSERT(ifp->if_broot == NULL); - ASSERT((ifp->if_flags & XFS_IFBROOT) == 0); - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FEXT(whichfork); + + /* + * Update our extent pointer, given that xfs_bmap_add_extent_delay_real + * or xfs_bmap_add_extent_hole_real might have merged it into one of + * the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + ASSERT(bma->got.br_startoff <= bma->offset); + ASSERT(bma->got.br_startoff + bma->got.br_blockcount >= + bma->offset + bma->length); + ASSERT(bma->got.br_state == XFS_EXT_NORM || + bma->got.br_state == XFS_EXT_UNWRITTEN); + return 0; +} + +STATIC int +xfs_bmapi_convert_unwritten( + struct xfs_bmalloca *bma, + struct xfs_bmbt_irec *mval, + xfs_filblks_t len, + int flags) +{ + int whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork); + int tmp_logflags = 0; + int error; + + /* check if we need to do unwritten->real conversion */ + if (mval->br_state == XFS_EXT_UNWRITTEN && + (flags & XFS_BMAPI_PREALLOC)) + return 0; + + /* check if we need to do real->unwritten conversion */ + if (mval->br_state == XFS_EXT_NORM && + (flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT)) + return 0; + + /* + * Modify (by adding) the state flag, if writing. + */ + ASSERT(mval->br_blockcount <= len); + if ((ifp->if_flags & XFS_IFBROOT) && !bma->cur) { + bma->cur = xfs_bmbt_init_cursor(bma->ip->i_mount, bma->tp, + bma->ip, whichfork); + bma->cur->bc_private.b.firstblock = *bma->firstblock; + bma->cur->bc_private.b.flist = bma->flist; + } + mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) + ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, + &bma->cur, mval, bma->firstblock, bma->flist, + &tmp_logflags); + bma->logflags |= tmp_logflags; + if (error) + return error; + + /* + * Update our extent pointer, given that + * xfs_bmap_add_extent_unwritten_real might have merged it into one + * of the neighbouring ones. + */ + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &bma->got); + + /* + * We may have combined previously unwritten space with written space, + * so generate another request. + */ + if (mval->br_blockcount < len) + return EAGAIN; return 0; } /* + * Map file blocks to filesystem blocks, and allocate blocks or convert the + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. + * + * The returned value in "firstblock" from the first call in a transaction + * must be remembered and presented to subsequent calls in "firstblock". + * An upper bound for the number of blocks to be allocated is supplied to + * the first call in "total"; if no allocation group has that many free + * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). + */ +int +xfs_bmapi_write( + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t bno, /* starting file offs. mapped */ + xfs_filblks_t len, /* length to map in file */ + int flags, /* XFS_BMAPI_... */ + xfs_fsblock_t *firstblock, /* first allocated block + controls a.g. for allocs */ + xfs_extlen_t total, /* total blocks needed */ + struct xfs_bmbt_irec *mval, /* output: map values */ + int *nmap, /* i/o: mval size/count */ + struct xfs_bmap_free *flist) /* i/o: list extents to free */ +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ + xfs_fileoff_t end; /* end of mapped file region */ + int eof; /* after the end of extents */ + int error; /* error return */ + int n; /* current extent index */ + xfs_fileoff_t obno; /* old block number (offset) */ + int whichfork; /* data or attr fork */ + char inhole; /* current location is hole in file */ + char wasdelay; /* old extent was delayed */ + +#ifdef DEBUG + xfs_fileoff_t orig_bno; /* original block number value */ + int orig_flags; /* original flags arg value */ + xfs_filblks_t orig_len; /* original value of len arg */ + struct xfs_bmbt_irec *orig_mval; /* original value of mval */ + int orig_nmap; /* original value of *nmap */ + + orig_bno = bno; + orig_len = len; + orig_flags = flags; + orig_mval = mval; + orig_nmap = *nmap; +#endif + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + + ASSERT(*nmap >= 1); + ASSERT(*nmap <= XFS_BMAP_MAX_NMAP); + ASSERT(!(flags & XFS_BMAPI_IGSTATE)); + ASSERT(tp != NULL); + ASSERT(len > 0); + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + ifp = XFS_IFORK_PTR(ip, whichfork); + + XFS_STATS_INC(xs_blk_mapw); + + if (*firstblock == NULLFSBLOCK) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) + bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; + else + bma.minleft = 1; + } else { + bma.minleft = 0; + } + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + goto error0; + } + + xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, + &bma.prev); + n = 0; + end = bno + len; + obno = bno; + + bma.tp = tp; + bma.ip = ip; + bma.total = total; + bma.userdata = 0; + bma.flist = flist; + bma.firstblock = firstblock; + + while (bno < end && n < *nmap) { + inhole = eof || bma.got.br_startoff > bno; + wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + + /* + * First, deal with the hole before the allocated space + * that we found, if any. + */ + if (inhole || wasdelay) { + bma.eof = eof; + bma.conv = !!(flags & XFS_BMAPI_CONVERT); + bma.wasdel = wasdelay; + bma.offset = bno; + bma.flags = flags; + + /* + * There's a 32/64 bit type mismatch between the + * allocation length request (which can be 64 bits in + * length) and the bma length request, which is + * xfs_extlen_t and therefore 32 bits. Hence we have to + * check for 32-bit overflows and handle them here. + */ + if (len > (xfs_filblks_t)MAXEXTLEN) + bma.length = MAXEXTLEN; + else + bma.length = len; + + ASSERT(len > 0); + ASSERT(bma.length > 0); + error = xfs_bmapi_allocate(&bma); + if (error) + goto error0; + if (bma.blkno == NULLFSBLOCK) + break; + } + + /* Deal with the allocated space we found. */ + xfs_bmapi_trim_map(mval, &bma.got, &bno, len, obno, + end, n, flags); + + /* Execute unwritten extent conversion if necessary */ + error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); + if (error == EAGAIN) + continue; + if (error) + goto error0; + + /* update the extent map to return */ + xfs_bmapi_update_map(&mval, &bno, &len, obno, end, &n, flags); + + /* + * If we're done, stop now. Stop when we've allocated + * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise + * the transaction may get too big. + */ + if (bno >= end || n >= *nmap || bma.nallocs >= *nmap) + break; + + /* Else go on to the next record. */ + bma.prev = bma.got; + if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), + &bma.got); + } else + eof = 1; + } + *nmap = n; + + /* + * Transform from btree to extents, give it cur. + */ + if (xfs_bmap_wants_extents(ip, whichfork)) { + int tmp_logflags = 0; + + ASSERT(bma.cur); + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, + &tmp_logflags, whichfork); + bma.logflags |= tmp_logflags; + if (error) + goto error0; + } + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || + XFS_IFORK_NEXTENTS(ip, whichfork) > + XFS_IFORK_MAXEXT(ip, whichfork)); + error = 0; +error0: + /* + * Log everything. Do this after conversion, there's no point in + * logging the extent records if we've converted to btree format. + */ + if ((bma.logflags & xfs_ilog_fext(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) + bma.logflags &= ~xfs_ilog_fext(whichfork); + else if ((bma.logflags & xfs_ilog_fbroot(whichfork)) && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) + bma.logflags &= ~xfs_ilog_fbroot(whichfork); + /* + * Log whatever the flags say, even if error. Otherwise we might miss + * detecting a case where the data is changed, there's an error, + * and it's not logged so we don't shutdown when we should. + */ + if (bma.logflags) + xfs_trans_log_inode(tp, ip, bma.logflags); + + if (bma.cur) { + if (!error) { + ASSERT(*firstblock == NULLFSBLOCK || + XFS_FSB_TO_AGNO(mp, *firstblock) == + XFS_FSB_TO_AGNO(mp, + bma.cur->bc_private.b.firstblock) || + (flist->xbf_low && + XFS_FSB_TO_AGNO(mp, *firstblock) < + XFS_FSB_TO_AGNO(mp, + bma.cur->bc_private.b.firstblock))); + *firstblock = bma.cur->bc_private.b.firstblock; + } + xfs_btree_del_cursor(bma.cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + if (!error) + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, + orig_nmap, *nmap); + return error; +} + +/* * Called by xfs_bmapi to update file extent records and the btree * after removing space (or undoing a delayed allocation). */ @@ -3045,14 +4712,12 @@ STATIC int /* error */ xfs_bmap_del_extent( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ - xfs_extnum_t idx, /* extent number to update/delete */ + xfs_extnum_t *idx, /* extent number to update/delete */ xfs_bmap_free_t *flist, /* list of extents to be freed */ xfs_btree_cur_t *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ - xfs_extdelta_t *delta, /* Change made to incore extents */ - int whichfork, /* data or attr fork */ - int rsvd) /* OK to allocate reserved blocks */ + int whichfork) /* data or attr fork */ { xfs_filblks_t da_new; /* new delay-alloc indirect blocks */ xfs_filblks_t da_old; /* old delay-alloc indirect blocks */ @@ -3060,12 +4725,9 @@ xfs_bmap_del_extent( xfs_fileoff_t del_endoff; /* first offset past del */ int delay; /* current block is delayed allocated */ int do_fx; /* free extent at end of routine */ - xfs_bmbt_rec_t *ep; /* current extent entry pointer */ + xfs_bmbt_rec_host_t *ep; /* current extent entry pointer */ int error; /* error return value */ int flags; /* inode logging flags */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_del_extent"; -#endif xfs_bmbt_irec_t got; /* current extent entry */ xfs_fileoff_t got_endoff; /* first offset past got */ int i; /* temp state */ @@ -3077,21 +4739,26 @@ xfs_bmap_del_extent( uint qfield; /* quota field to update */ xfs_filblks_t temp; /* for indirect length calculations */ xfs_filblks_t temp2; /* for indirect length calculations */ + int state = 0; XFS_STATS_INC(xs_del_exlist); + + if (whichfork == XFS_ATTR_FORK) + state |= BMAP_ATTRFORK; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((idx >= 0) && (idx < ifp->if_bytes / + ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); ASSERT(del->br_blockcount > 0); - ep = xfs_iext_get_ext(ifp, idx); + ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); ASSERT(got.br_startoff <= del->br_startoff); del_endoff = del->br_startoff + del->br_blockcount; got_endoff = got.br_startoff + got.br_blockcount; ASSERT(got_endoff >= del_endoff); - delay = ISNULLSTARTBLOCK(got.br_startblock); - ASSERT(ISNULLSTARTBLOCK(del->br_startblock) == delay); + delay = isnullstartblock(got.br_startblock); + ASSERT(isnullstartblock(del->br_startblock) == delay); flags = 0; qfield = 0; error = 0; @@ -3103,8 +4770,7 @@ xfs_bmap_del_extent( /* * Realtime allocation. Free it and record di_nblocks update. */ - if (whichfork == XFS_DATA_FORK && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { + if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { xfs_fsblock_t bno; xfs_filblks_t len; @@ -3116,8 +4782,8 @@ xfs_bmap_del_extent( len = del->br_blockcount; do_div(bno, mp->m_sb.sb_rextsize); do_div(len, mp->m_sb.sb_rextsize); - if ((error = xfs_rtfree_extent(ip->i_transp, bno, - (xfs_extlen_t)len))) + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) goto done; do_fx = 0; nblks = len * mp->m_sb.sb_rextsize; @@ -3140,11 +4806,11 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } da_old = da_new = 0; } else { - da_old = STARTBLOCKVAL(got.br_startblock); + da_old = startblockval(got.br_startblock); da_new = 0; nblks = 0; do_fx = 0; @@ -3159,45 +4825,44 @@ xfs_bmap_del_extent( /* * Matches the whole extent. Delete the entry. */ - xfs_bmap_trace_delete(fname, "3", ip, idx, 1, whichfork); - xfs_iext_remove(ifp, idx, 1); - ifp->if_lastex = idx; + xfs_iext_remove(ip, *idx, 1, + whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0); + --*idx; if (delay) break; + XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) - 1); flags |= XFS_ILOG_CORE; if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } - if ((error = xfs_bmbt_delete(cur, &i))) + if ((error = xfs_btree_delete(cur, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); break; case 2: /* * Deleting the first part of the extent. */ - xfs_bmap_trace_pre_update(fname, "2", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_startoff(ep, del_endoff); temp = got.br_blockcount - del->br_blockcount; xfs_bmbt_set_blockcount(ep, temp); - ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "2", ip, idx, - whichfork); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); da_new = temp; break; } xfs_bmbt_set_startblock(ep, del_endblock); - xfs_bmap_trace_post_update(fname, "2", ip, idx, whichfork); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_bmbt_update(cur, del_endoff, del_endblock, @@ -3211,21 +4876,19 @@ xfs_bmap_del_extent( * Deleting the last part of the extent. */ temp = got.br_blockcount - del->br_blockcount; - xfs_bmap_trace_pre_update(fname, "1", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); - ifp->if_lastex = idx; if (delay) { temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), da_old); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); - xfs_bmap_trace_post_update(fname, "1", ip, idx, - whichfork); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); da_new = temp; break; } - xfs_bmap_trace_post_update(fname, "1", ip, idx, whichfork); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); if (!cur) { - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); break; } if ((error = xfs_bmbt_update(cur, got.br_startoff, @@ -3240,7 +4903,7 @@ xfs_bmap_del_extent( * Deleting the middle of the extent. */ temp = del->br_startoff - got.br_startoff; - xfs_bmap_trace_pre_update(fname, "0", ip, idx, whichfork); + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); xfs_bmbt_set_blockcount(ep, temp); new.br_startoff = del_endoff; temp2 = got_endoff - del_endoff; @@ -3255,10 +4918,10 @@ xfs_bmap_del_extent( got.br_startblock, temp, got.br_state))) goto done; - if ((error = xfs_bmbt_increment(cur, 0, &i))) + if ((error = xfs_btree_increment(cur, 0, &i))) goto done; cur->bc_rec.b = new; - error = xfs_bmbt_insert(cur, &i); + error = xfs_btree_insert(cur, &i); if (error && error != ENOSPC) goto done; /* @@ -3277,7 +4940,7 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); /* * Update the btree record back * to the original value. @@ -3298,24 +4961,24 @@ xfs_bmap_del_extent( error = XFS_ERROR(ENOSPC); goto done; } - ASSERT(i == 1); + XFS_WANT_CORRUPTED_GOTO(i == 1, done); } else - flags |= XFS_ILOG_FEXT(whichfork); + flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, XFS_IFORK_NEXTENTS(ip, whichfork) + 1); } else { ASSERT(whichfork == XFS_DATA_FORK); temp = xfs_bmap_worst_indlen(ip, temp); - xfs_bmbt_set_startblock(ep, NULLSTARTBLOCK((int)temp)); + xfs_bmbt_set_startblock(ep, nullstartblock((int)temp)); temp2 = xfs_bmap_worst_indlen(ip, temp2); - new.br_startblock = NULLSTARTBLOCK((int)temp2); + new.br_startblock = nullstartblock((int)temp2); da_new = temp + temp2; while (da_new > da_old) { if (temp) { temp--; da_new--; xfs_bmbt_set_startblock(ep, - NULLSTARTBLOCK((int)temp)); + nullstartblock((int)temp)); } if (da_new == da_old) break; @@ -3323,15 +4986,13 @@ xfs_bmap_del_extent( temp2--; da_new--; new.br_startblock = - NULLSTARTBLOCK((int)temp2); + nullstartblock((int)temp2); } } } - xfs_bmap_trace_post_update(fname, "0", ip, idx, whichfork); - xfs_bmap_trace_insert(fname, "0", ip, idx + 1, 1, &new, NULL, - whichfork); - xfs_iext_insert(ifp, idx + 1, 1, &new); - ifp->if_lastex = idx + 1; + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); + xfs_iext_insert(ip, *idx + 1, 1, &new, state); + ++*idx; break; } /* @@ -3349,2014 +5010,23 @@ xfs_bmap_del_extent( * Adjust quota data. */ if (qfield) - XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, qfield, (long)-nblks); + xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks); /* * Account for change in delayed indirect blocks. * Nothing to do for disk quota accounting here. */ ASSERT(da_old >= da_new); - if (da_old > da_new) - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, (int)(da_old - da_new), - rsvd); - if (delta) { - /* DELTA: report the original extent. */ - if (delta->xed_startoff > got.br_startoff) - delta->xed_startoff = got.br_startoff; - if (delta->xed_blockcount < got.br_startoff+got.br_blockcount) - delta->xed_blockcount = got.br_startoff + - got.br_blockcount; - } -done: - *logflagsp = flags; - return error; -} - -/* - * Remove the entry "free" from the free item list. Prev points to the - * previous entry, unless "free" is the head of the list. - */ -STATIC void -xfs_bmap_del_free( - xfs_bmap_free_t *flist, /* free item list header */ - xfs_bmap_free_item_t *prev, /* previous item on list, if any */ - xfs_bmap_free_item_t *free) /* list item to be freed */ -{ - if (prev) - prev->xbfi_next = free->xbfi_next; - else - flist->xbf_first = free->xbfi_next; - flist->xbf_count--; - kmem_zone_free(xfs_bmap_free_item_zone, free); -} - -/* - * Convert an extents-format file into a btree-format file. - * The new file will have a root block (in the inode) and a single child block. - */ -STATIC int /* error */ -xfs_bmap_extents_to_btree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first-block-allocated */ - xfs_bmap_free_t *flist, /* blocks freed in xaction */ - xfs_btree_cur_t **curp, /* cursor returned to caller */ - int wasdel, /* converting a delayed alloc */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_block_t *ablock; /* allocated (child) bt block */ - xfs_buf_t *abp; /* buffer for ablock */ - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_rec_t *arp; /* child record pointer */ - xfs_bmbt_block_t *block; /* btree root block */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - int error; /* error return value */ - xfs_extnum_t i, cnt; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_key_t *kp; /* root block key pointer */ - xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_ptr_t *pp; /* root block address pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - /* - * Make space in the inode incore. - */ - xfs_iroot_realloc(ip, 1, whichfork); - ifp->if_flags |= XFS_IFBROOT; - /* - * Fill in the root. - */ - block = ifp->if_broot; - block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - block->bb_level = cpu_to_be16(1); - block->bb_numrecs = cpu_to_be16(1); - block->bb_leftsib = cpu_to_be64(NULLDFSBNO); - block->bb_rightsib = cpu_to_be64(NULLDFSBNO); - /* - * Need a cursor. Can't allocate until bb_level is filled in. - */ - mp = ip->i_mount; - cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, - whichfork); - cur->bc_private.b.firstblock = *firstblock; - cur->bc_private.b.flist = flist; - cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0; - /* - * Convert to a btree with two levels, one record in root. - */ - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE); - args.tp = tp; - args.mp = mp; - args.firstblock = *firstblock; - if (*firstblock == NULLFSBLOCK) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); - } else if (flist->xbf_low) { - args.type = XFS_ALLOCTYPE_START_BNO; - args.fsbno = *firstblock; - } else { - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.fsbno = *firstblock; - } - args.minlen = args.maxlen = args.prod = 1; - args.total = args.minleft = args.alignment = args.mod = args.isfl = - args.minalignslop = 0; - args.wasdel = wasdel; - *logflagsp = 0; - if ((error = xfs_alloc_vextent(&args))) { - xfs_iroot_realloc(ip, -1, whichfork); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - /* - * Allocation can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(*firstblock == NULLFSBLOCK || - args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) || - (flist->xbf_low && - args.agno > XFS_FSB_TO_AGNO(mp, *firstblock))); - *firstblock = cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - ip->i_d.di_nblocks++; - XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, 1L); - abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0); - /* - * Fill in the child block. - */ - ablock = XFS_BUF_TO_BMBT_BLOCK(abp); - ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - ablock->bb_level = 0; - ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO); - ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO); - arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (cnt = i = 0; i < nextents; i++) { - ep = xfs_iext_get_ext(ifp, i); - if (!ISNULLSTARTBLOCK(xfs_bmbt_get_startblock(ep))) { - arp->l0 = INT_GET(ep->l0, ARCH_CONVERT); - arp->l1 = INT_GET(ep->l1, ARCH_CONVERT); - arp++; cnt++; - } - } - ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork)); - ablock->bb_numrecs = cpu_to_be16(cnt); - /* - * Fill in the root key and pointer. - */ - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - arp = XFS_BMAP_REC_IADDR(ablock, 1, cur); - INT_SET(kp->br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(arp)); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - INT_SET(*pp, ARCH_CONVERT, args.fsbno); - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS); - xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs)); - ASSERT(*curp == NULL); - *curp = cur; - *logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork); - return 0; -} - -/* - * Helper routine to reset inode di_forkoff field when switching - * attribute fork from local to extent format - we reset it where - * possible to make space available for inline data fork extents. - */ -STATIC void -xfs_bmap_forkoff_reset( - xfs_mount_t *mp, - xfs_inode_t *ip, - int whichfork) -{ - if (whichfork == XFS_ATTR_FORK && - (ip->i_d.di_format != XFS_DINODE_FMT_DEV) && - (ip->i_d.di_format != XFS_DINODE_FMT_UUID) && - ((mp->m_attroffset >> 3) > ip->i_d.di_forkoff)) { - ip->i_d.di_forkoff = mp->m_attroffset >> 3; - ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / - (uint)sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_ext_max = XFS_IFORK_ASIZE(ip) / - (uint)sizeof(xfs_bmbt_rec_t); - } -} - -/* - * Convert a local file to an extents file. - * This code is out of bounds for data forks of regular files, - * since the file data needs to get logged so things will stay consistent. - * (The bmap-level manipulations are ok, though). - */ -STATIC int /* error */ -xfs_bmap_local_to_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fsblock_t *firstblock, /* first block allocated in xaction */ - xfs_extlen_t total, /* total blocks needed by transaction */ - int *logflagsp, /* inode logging flags */ - int whichfork) /* data or attr fork */ -{ - int error; /* error return value */ - int flags; /* logging flags returned */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_local_to_extents"; -#endif - xfs_ifork_t *ifp; /* inode fork pointer */ - - /* - * We don't want to deal with the case of keeping inode data inline yet. - * So sending the data fork of a regular inode is invalid. - */ - ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && - whichfork == XFS_DATA_FORK)); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - flags = 0; - error = 0; - if (ifp->if_bytes) { - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - - args.tp = tp; - args.mp = ip->i_mount; - args.firstblock = *firstblock; - ASSERT((ifp->if_flags & - (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); - /* - * Allocate a block. We know we need only one, since the - * file currently fits in an inode. - */ - if (*firstblock == NULLFSBLOCK) { - args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); - args.type = XFS_ALLOCTYPE_START_BNO; - } else { - args.fsbno = *firstblock; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - } - args.total = total; - args.mod = args.minleft = args.alignment = args.wasdel = - args.isfl = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - if ((error = xfs_alloc_vextent(&args))) - goto done; - /* - * Can't fail, the space was reserved. - */ - ASSERT(args.fsbno != NULLFSBLOCK); - ASSERT(args.len == 1); - *firstblock = args.fsbno; - bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); - memcpy((char *)XFS_BUF_PTR(bp), ifp->if_u1.if_data, - ifp->if_bytes); - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); - xfs_bmap_forkoff_reset(args.mp, ip, whichfork); - xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); - xfs_bmap_trace_post_update(fname, "new", ip, 0, whichfork); - XFS_IFORK_NEXT_SET(ip, whichfork, 1); - ip->i_d.di_nblocks = 1; - XFS_TRANS_MOD_DQUOT_BYINO(args.mp, tp, ip, - XFS_TRANS_DQ_BCOUNT, 1L); - flags |= XFS_ILOG_FEXT(whichfork); - } else { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0); - xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork); + if (da_old > da_new) { + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)(da_old - da_new), 0); } - ifp->if_flags &= ~XFS_IFINLINE; - ifp->if_flags |= XFS_IFEXTENTS; - XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); - flags |= XFS_ILOG_CORE; done: *logflagsp = flags; return error; } /* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; -#if XFS_BIG_BLKNOS - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - gotp->br_startblock = 0xffffa5a5; -#endif - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int whichfork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - int rt; /* realtime flag */ - - XFS_STATS_INC(xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, whichfork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); - if (unlikely(!rt && !gotp->br_startblock && (*lastxp != NULLEXTNUM))) { - cmn_err(CE_PANIC,"Access to block zero: fs: <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, (long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state); - } - return ep; -} - - -#ifdef XFS_BMAP_TRACE -ktrace_t *xfs_bmap_trace_buf; - -/* - * Add a bmap trace buffer entry. Base routine for the others. - */ -STATIC void -xfs_bmap_trace_addentry( - int opcode, /* operation */ - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(ies) */ - xfs_extnum_t cnt, /* count of entries, 1 or 2 */ - xfs_bmbt_rec_t *r1, /* first record */ - xfs_bmbt_rec_t *r2, /* second record or null */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t tr2; - - ASSERT(cnt == 1 || cnt == 2); - ASSERT(r1 != NULL); - if (cnt == 1) { - ASSERT(r2 == NULL); - r2 = &tr2; - memset(&tr2, 0, sizeof(tr2)); - } else - ASSERT(r2 != NULL); - ktrace_enter(xfs_bmap_trace_buf, - (void *)(__psint_t)(opcode | (whichfork << 16)), - (void *)fname, (void *)desc, (void *)ip, - (void *)(__psint_t)idx, - (void *)(__psint_t)cnt, - (void *)(__psunsigned_t)(ip->i_ino >> 32), - (void *)(__psunsigned_t)(unsigned)ip->i_ino, - (void *)(__psunsigned_t)(r1->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l0), - (void *)(__psunsigned_t)(r1->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l1), - (void *)(__psunsigned_t)(r2->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l0), - (void *)(__psunsigned_t)(r2->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l1) - ); - ASSERT(ip->i_xtrace); - ktrace_enter(ip->i_xtrace, - (void *)(__psint_t)(opcode | (whichfork << 16)), - (void *)fname, (void *)desc, (void *)ip, - (void *)(__psint_t)idx, - (void *)(__psint_t)cnt, - (void *)(__psunsigned_t)(ip->i_ino >> 32), - (void *)(__psunsigned_t)(unsigned)ip->i_ino, - (void *)(__psunsigned_t)(r1->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l0), - (void *)(__psunsigned_t)(r1->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r1->l1), - (void *)(__psunsigned_t)(r2->l0 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l0), - (void *)(__psunsigned_t)(r2->l1 >> 32), - (void *)(__psunsigned_t)(unsigned)(r2->l1) - ); -} - -/* - * Add bmap trace entry prior to a call to xfs_iext_remove. - */ -STATIC void -xfs_bmap_trace_delete( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) deleted */ - xfs_extnum_t cnt, /* count of entries deleted, 1 or 2 */ - int whichfork) /* data or attr fork */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_DELETE, fname, desc, ip, idx, - cnt, xfs_iext_get_ext(ifp, idx), - cnt == 2 ? xfs_iext_get_ext(ifp, idx + 1) : NULL, - whichfork); -} - -/* - * Add bmap trace entry prior to a call to xfs_iext_insert, or - * reading in the extents list from the disk (in the btree). - */ -STATIC void -xfs_bmap_trace_insert( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry(entries) inserted */ - xfs_extnum_t cnt, /* count of entries inserted, 1 or 2 */ - xfs_bmbt_irec_t *r1, /* inserted record 1 */ - xfs_bmbt_irec_t *r2, /* inserted record 2 or null */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t tr1; /* compressed record 1 */ - xfs_bmbt_rec_t tr2; /* compressed record 2 if needed */ - - xfs_bmbt_set_all(&tr1, r1); - if (cnt == 2) { - ASSERT(r2 != NULL); - xfs_bmbt_set_all(&tr2, r2); - } else { - ASSERT(cnt == 1); - ASSERT(r2 == NULL); - } - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_INSERT, fname, desc, ip, idx, - cnt, &tr1, cnt == 2 ? &tr2 : NULL, whichfork); -} - -/* - * Add bmap trace entry after updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_post_update( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry updated */ - int whichfork) /* data or attr fork */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_POST_UP, fname, desc, ip, idx, - 1, xfs_iext_get_ext(ifp, idx), NULL, whichfork); -} - -/* - * Add bmap trace entry prior to updating an extent record in place. - */ -STATIC void -xfs_bmap_trace_pre_update( - char *fname, /* function name */ - char *desc, /* operation description */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t idx, /* index of entry to be updated */ - int whichfork) /* data or attr fork */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - xfs_bmap_trace_addentry(XFS_BMAP_KTRACE_PRE_UP, fname, desc, ip, idx, 1, - xfs_iext_get_ext(ifp, idx), NULL, whichfork); -} -#endif /* XFS_BMAP_TRACE */ - -/* - * Compute the worst-case number of indirect blocks that will be used - * for ip's delayed extent of length "len". - */ -STATIC xfs_filblks_t -xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ -{ - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ - - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; - for (level = 0, rval = 0; - level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); - level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - - level - 1; - if (level == 0) - maxrecs = mp->m_bmap_dmxr[1]; - } - return rval; -} - -#if defined(XFS_RW_TRACE) -STATIC void -xfs_bunmap_trace( - xfs_inode_t *ip, - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - inst_t *ra) -{ - if (ip->i_rwtrace == NULL) - return; - ktrace_enter(ip->i_rwtrace, - (void *)(__psint_t)XFS_BUNMAP, - (void *)ip, - (void *)(__psint_t)((ip->i_d.di_size >> 32) & 0xffffffff), - (void *)(__psint_t)(ip->i_d.di_size & 0xffffffff), - (void *)(__psint_t)(((xfs_dfiloff_t)bno >> 32) & 0xffffffff), - (void *)(__psint_t)((xfs_dfiloff_t)bno & 0xffffffff), - (void *)(__psint_t)len, - (void *)(__psint_t)flags, - (void *)(unsigned long)current_cpu(), - (void *)ra, - (void *)0, - (void *)0, - (void *)0, - (void *)0, - (void *)0, - (void *)0); -} -#endif - -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - xfs_inode_t *ip, /* incore inode pointer */ - int size, /* space new attribute needs */ - int rsvd) /* xact may use reserved blks */ -{ - xfs_fsblock_t firstblock; /* 1st block/ag allocated */ - xfs_bmap_free_t flist; /* freed extent records */ - xfs_mount_t *mp; /* mount structure */ - xfs_trans_t *tp; /* transaction pointer */ - unsigned long s; /* spinlock spl value */ - int blks; /* space reservation */ - int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ - int logflags; /* logging flags */ - int error; /* error return value */ - - ASSERT(XFS_IFORK_Q(ip) == 0); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - - mp = ip->i_mount; - ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); - tp = xfs_trans_alloc(mp, XFS_TRANS_ADDAFORK); - blks = XFS_ADDAFORK_SPACE_RES(mp); - if (rsvd) - tp->t_flags |= XFS_TRANS_RESERVE; - if ((error = xfs_trans_reserve(tp, blks, XFS_ADDAFORK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ADDAFORK_LOG_COUNT))) - goto error0; - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, blks, 0, rsvd ? - XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : - XFS_QMOPT_RES_REGBLKS); - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); - return error; - } - if (XFS_IFORK_Q(ip)) - goto error1; - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { - /* - * For inodes coming from pre-6.2 filesystems. - */ - ASSERT(ip->i_d.di_aformat == 0); - ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - } - ASSERT(ip->i_d.di_anextents == 0); - VN_HOLD(XFS_ITOV(ip)); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_DEV: - ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3; - break; - case XFS_DINODE_FMT_UUID: - ip->i_d.di_forkoff = roundup(sizeof(uuid_t), 8) >> 3; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size); - if (!ip->i_d.di_forkoff) - ip->i_d.di_forkoff = mp->m_attroffset >> 3; - else if (mp->m_flags & XFS_MOUNT_ATTR2) - version = 2; - break; - default: - ASSERT(0); - error = XFS_ERROR(EINVAL); - goto error1; - } - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - ip->i_afp->if_flags = XFS_IFEXTENTS; - logflags = 0; - XFS_BMAP_INIT(&flist, &firstblock); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_LOCAL: - error = xfs_bmap_add_attrfork_local(tp, ip, &firstblock, &flist, - &logflags); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_bmap_add_attrfork_extents(tp, ip, &firstblock, - &flist, &logflags); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_bmap_add_attrfork_btree(tp, ip, &firstblock, &flist, - &logflags); - break; - default: - error = 0; - break; - } - if (logflags) - xfs_trans_log_inode(tp, ip, logflags); - if (error) - goto error2; - if (!XFS_SB_VERSION_HASATTR(&mp->m_sb) || - (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2)) { - __int64_t sbfields = 0; - - s = XFS_SB_LOCK(mp); - if (!XFS_SB_VERSION_HASATTR(&mp->m_sb)) { - XFS_SB_VERSION_ADDATTR(&mp->m_sb); - sbfields |= XFS_SB_VERSIONNUM; - } - if (!XFS_SB_VERSION_HASATTR2(&mp->m_sb) && version == 2) { - XFS_SB_VERSION_ADDATTR2(&mp->m_sb); - sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2); - } - if (sbfields) { - XFS_SB_UNLOCK(mp, s); - xfs_mod_sb(tp, sbfields); - } else - XFS_SB_UNLOCK(mp, s); - } - if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) - goto error2; - error = xfs_trans_commit(tp, XFS_TRANS_PERM_LOG_RES, NULL); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - return error; -error2: - xfs_bmap_cancel(&flist); -error1: - ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE)); - xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - return error; -} - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -/* ARGSUSED */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - xfs_mount_t *mp) /* mount point structure */ -{ - xfs_bmap_free_item_t *cur; /* current (next) element */ - xfs_bmap_free_item_t *new; /* new element */ - xfs_bmap_free_item_t *prev; /* previous element */ -#ifdef DEBUG - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!ISNULLSTARTBLOCK(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); - new->xbfi_startblock = bno; - new->xbfi_blockcount = (xfs_extlen_t)len; - for (prev = NULL, cur = flist->xbf_first; - cur != NULL; - prev = cur, cur = cur->xbfi_next) { - if (cur->xbfi_startblock >= bno) - break; - } - if (prev) - prev->xbfi_next = new; - else - flist->xbf_first = new; - new->xbfi_next = cur; - flist->xbf_count++; -} - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - xfs_mount_t *mp, /* file system mount structure */ - int whichfork) /* data or attr fork */ -{ - int level; /* btree level */ - uint maxblocks; /* max blocks at this level */ - uint maxleafents; /* max leaf entries possible */ - int maxrootrecs; /* max records in root block */ - int minleafrecs; /* min records in leaf block */ - int minnoderecs; /* min records in node block */ - int sz; /* root block size */ - - /* - * The maximum number of extents in a file, hence the maximum - * number of leaf entries, is controlled by the type of di_nextents - * (a signed 32-bit number, xfs_extnum_t), or by di_anextents - * (a signed 16-bit number, xfs_aextnum_t). - */ - if (whichfork == XFS_DATA_FORK) { - maxleafents = MAXEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? - XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset; - } else { - maxleafents = MAXAEXTNUM; - sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? - XFS_BMDR_SPACE_CALC(MINABTPTRS) : - mp->m_sb.sb_inodesize - mp->m_attroffset; - } - maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); - minleafrecs = mp->m_bmap_dmnr[0]; - minnoderecs = mp->m_bmap_dmnr[1]; - maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs; - for (level = 1; maxblocks > 1; level++) { - if (maxblocks <= maxrootrecs) - maxblocks = 1; - else - maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; - } - mp->m_bm_maxlevels[whichfork] = level; -} - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. We never free any extents in - * the first transaction. This is to allow the caller to make the first - * transaction a synchronous one so that the pointers to the data being - * broken in this transaction will be permanent before the data is actually - * freed. This is necessary to prevent blocks from being reallocated - * and written to before the free and reallocation are actually permanent. - * We do not just make the first transaction synchronous here, because - * there are more efficient ways to gain the same protection in some cases - * (see the file truncation code). - * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. - */ -/*ARGSUSED*/ -int /* error */ -xfs_bmap_finish( - xfs_trans_t **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_fsblock_t firstblock, /* controlled ag for allocs */ - int *committed) /* xact committed or not */ -{ - xfs_efd_log_item_t *efd; /* extent free data */ - xfs_efi_log_item_t *efi; /* extent free intention */ - int error; /* error return value */ - xfs_bmap_free_item_t *free; /* free extent item */ - unsigned int logres; /* new log reservation */ - unsigned int logcount; /* new log count */ - xfs_mount_t *mp; /* filesystem mount structure */ - xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ - - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; - return 0; - } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); - for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, - free->xbfi_blockcount); - logres = ntp->t_log_res; - logcount = ntp->t_log_count; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0, NULL); - *tp = ntp; - *committed = 1; - /* - * We have a new transaction, so we should return committed=1, - * even though we're returning an error. - */ - if (error) { - return error; - } - if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES, - logcount))) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); - for (free = flist->xbf_first; free != NULL; free = next) { - next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, - free->xbfi_blockcount))) { - /* - * The bmap free list will be cleaned up at a - * higher level. The EFI will be canceled when - * this transaction is aborted. - * Need to force shutdown here to make sure it - * happens, since this transaction may not be - * dirty yet. - */ - mp = ntp->t_mountp; - if (!XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, - (error == EFSCORRUPTED) ? - SHUTDOWN_CORRUPT_INCORE : - SHUTDOWN_META_IO_ERROR); - return error; - } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, - free->xbfi_blockcount); - xfs_bmap_del_free(flist, NULL, free); - } - return 0; -} - -/* - * Free up any items left in the list. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist) /* list of bmap_free_items */ -{ - xfs_bmap_free_item_t *free; /* free list item */ - xfs_bmap_free_item_t *next; - - if (flist->xbf_count == 0) - return; - ASSERT(flist->xbf_first != NULL); - for (free = flist->xbf_first; free; free = next) { - next = free->xbfi_next; - xfs_bmap_del_free(flist, NULL, free); - } - ASSERT(flist->xbf_count == 0); -} - -/* - * Returns the file-relative block number of the first unused block(s) - * in the file with at least "len" logically contiguous blocks free. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - * Return 0 if the file is currently local (in-inode). - */ -int /* error */ -xfs_bmap_first_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *first_unused, /* unused block */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* pointer to an extent entry */ - int error; /* error return value */ - int idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_fileoff_t lastaddr; /* last block number seen */ - xfs_fileoff_t lowest; /* lowest useful block */ - xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ - xfs_extnum_t nextents; /* number of extent entries */ - - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || - XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *first_unused = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); - /* - * See if the hole before this extent will work. - */ - if (off >= lowest + len && off - max >= len) { - *first_unused = max; - return 0; - } - lastaddr = off + xfs_bmbt_get_blockcount(ep); - max = XFS_FILEOFF_MAX(lastaddr, lowest); - } - *first_unused = max; - return 0; -} - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int /* error */ -xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ -{ - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; - } - /* - * Otherwise *last_block is already the right answer. - */ - return 0; -} - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent records. - * Returns 0 for local files, as they do not have extent records. - */ -int /* error */ -xfs_bmap_last_offset( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EIO); - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - *last_block = 0; - return 0; - } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (!nextents) { - *last_block = 0; - return 0; - } - ep = xfs_iext_get_ext(ifp, nextents - 1); - *last_block = xfs_bmbt_get_startoff(ep) + xfs_bmbt_get_blockcount(ep); - return 0; -} - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int /* 1=>1 block, 0=>otherwise */ -xfs_bmap_one_block( - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* ptr to fork's extent */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int rval; /* return value */ - xfs_bmbt_irec_t s; /* internal version of extent */ - -#ifndef DEBUG - if (whichfork == XFS_DATA_FORK) - return ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize; -#endif /* !DEBUG */ - if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) - return 0; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_get_all(ep, &s); - rval = s.br_startoff == 0 && s.br_blockcount == 1; - if (rval && whichfork == XFS_DATA_FORK) - ASSERT(ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize); - return rval; -} - -/* - * Read in the extents to if_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. If the file system cannot contain unwritten - * extents, the records are checked for no "state" flags. - */ -int /* error */ -xfs_bmap_read_extents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_block_t *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_bmap_read_extents"; -#endif - xfs_extnum_t i, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ - /* REFERENCED */ - xfs_extnum_t room; /* number of entries there's room for */ - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - return error; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - XFS_BMAP_SANITY_CHECK(mp, block, level), - error0); - if (level == 0) - break; - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, - 1, mp->m_bmap_dmxr[1]); - XFS_WANT_CORRUPTED_GOTO( - XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), - error0); - bno = INT_GET(*pp, ARCH_CONVERT); - xfs_trans_brelse(tp, bp); - } - /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - i = 0; - /* - * Loop over all leaf nodes. Copy information to the extent records. - */ - for (;;) { - xfs_bmbt_rec_t *frp, *trp; - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; - xfs_extnum_t start; - - - num_recs = be16_to_cpu(block->bb_numrecs); - if (unlikely(i + num_recs > room)) { - ASSERT(i + num_recs <= room); - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, (btree extents).", - (unsigned long long) ip->i_ino); - XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO( - XFS_BMAP_SANITY_CHECK(mp, block, 0), - error0); - /* - * Read-ahead the next leaf block, if any. - */ - nextbno = be64_to_cpu(block->bb_rightsib); - if (nextbno != NULLFSBLOCK) - xfs_btree_reada_bufl(mp, nextbno, 1); - /* - * Copy records into the extent records. - */ - frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, 1, mp->m_bmap_dmxr[0]); - start = i; - for (j = 0; j < num_recs; j++, i++, frp++) { - trp = xfs_iext_get_ext(ifp, i); - trp->l0 = INT_GET(frp->l0, ARCH_CONVERT); - trp->l1 = INT_GET(frp->l1, ARCH_CONVERT); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { - XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - goto error0; - } - } - xfs_trans_brelse(tp, bp); - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - return error; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); - xfs_bmap_trace_exlist(fname, ip, i, whichfork); - return 0; -error0: - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); -} - -#ifdef XFS_BMAP_TRACE -/* - * Add bmap trace insert entries for all the contents of the extent records. - */ -void -xfs_bmap_trace_exlist( - char *fname, /* function name */ - xfs_inode_t *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* current extent record */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t s; /* file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - for (idx = 0; idx < cnt; idx++) { - ep = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(ep, &s); - xfs_bmap_trace_insert(fname, "exlist", ip, idx, 1, &s, NULL, - whichfork); - } -} -#endif - -#ifdef DEBUG -/* - * Validate that the bmbt_irecs being returned from bmapi are valid - * given the callers original parameters. Specifically check the - * ranges of the returned irecs to ensure that they only extent beyond - * the given parameters if the XFS_BMAPI_ENTIRE flag was set. - */ -STATIC void -xfs_bmap_validate_ret( - xfs_fileoff_t bno, - xfs_filblks_t len, - int flags, - xfs_bmbt_irec_t *mval, - int nmap, - int ret_nmap) -{ - int i; /* index to map values */ - - ASSERT(ret_nmap <= nmap); - - for (i = 0; i < ret_nmap; i++) { - ASSERT(mval[i].br_blockcount > 0); - if (!(flags & XFS_BMAPI_ENTIRE)) { - ASSERT(mval[i].br_startoff >= bno); - ASSERT(mval[i].br_blockcount <= len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount <= - bno + len); - } else { - ASSERT(mval[i].br_startoff < bno + len); - ASSERT(mval[i].br_startoff + mval[i].br_blockcount > - bno); - } - ASSERT(i == 0 || - mval[i - 1].br_startoff + mval[i - 1].br_blockcount == - mval[i].br_startoff); - if ((flags & XFS_BMAPI_WRITE) && !(flags & XFS_BMAPI_DELAY)) - ASSERT(mval[i].br_startblock != DELAYSTARTBLOCK && - mval[i].br_startblock != HOLESTARTBLOCK); - ASSERT(mval[i].br_state == XFS_EXT_NORM || - mval[i].br_state == XFS_EXT_UNWRITTEN); - } -} -#endif /* DEBUG */ - - -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - xfs_bmbt_irec_t *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta) /* o: change made to incore extents */ -{ - xfs_fsblock_t abno; /* allocated block number */ - xfs_extlen_t alen; /* allocated extent length */ - xfs_fileoff_t aoff; /* allocated file offset */ - xfs_bmalloca_t bma; /* args for xfs_bmap_alloc */ - xfs_btree_cur_t *cur; /* bmap btree cursor */ - xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* we've hit the end of extents */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extlen_t indlen; /* indirect blocks length */ - xfs_extnum_t lastx; /* last useful extent number */ - int logflags; /* flags for transaction logging */ - xfs_extlen_t minleft; /* min blocks left after allocation */ - xfs_extlen_t minlen; /* min allocation size */ - xfs_mount_t *mp; /* xfs mount structure */ - int n; /* current extent index */ - int nallocs; /* number of extents alloc\'d */ - xfs_extnum_t nextents; /* number of extents in file */ - xfs_fileoff_t obno; /* old block number (offset) */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - int tmp_logflags; /* temp flags holder */ - int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ - char wr; /* this is a write request */ - char rt; /* this is a realtime file */ -#ifdef DEBUG - xfs_fileoff_t orig_bno; /* original block number value */ - int orig_flags; /* original flags arg value */ - xfs_filblks_t orig_len; /* original value of len arg */ - xfs_bmbt_irec_t *orig_mval; /* original value of mval */ - int orig_nmap; /* original value of *nmap */ - - orig_bno = bno; - orig_len = len; - orig_flags = flags; - orig_mval = mval; - orig_nmap = *nmap; -#endif - ASSERT(*nmap >= 1); - ASSERT(*nmap <= XFS_BMAP_MAX_NMAP || !(flags & XFS_BMAPI_WRITE)); - whichfork = (flags & XFS_BMAPI_ATTRFORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - mp = ip->i_mount; - if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL), - mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { - XFS_ERROR_REPORT("xfs_bmapi", XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - rt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - if ((wr = (flags & XFS_BMAPI_WRITE)) != 0) - XFS_STATS_INC(xs_blk_mapw); - else - XFS_STATS_INC(xs_blk_mapr); - /* - * IGSTATE flag is used to combine extents which - * differ only due to the state of the extents. - * This technique is used from xfs_getbmap() - * when the caller does not wish to see the - * separation (which is the default). - * - * This technique is also used when writing a - * buffer which has been partially written, - * (usually by being flushed during a chunkread), - * to ensure one write takes place. This also - * prevents a change in the xfs inode extents at - * this time, intentionally. This change occurs - * on completion of the write operation, in - * xfs_strat_comp(), where the xfs_bmapi() call - * is transactioned, and the extents combined. - */ - if ((flags & XFS_BMAPI_IGSTATE) && wr) /* if writing unwritten space */ - wr = 0; /* no allocations are allowed */ - ASSERT(wr || !(flags & XFS_BMAPI_DELAY)); - logflags = 0; - nallocs = 0; - cur = NULL; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - ASSERT(wr && tp); - if ((error = xfs_bmap_local_to_extents(tp, ip, - firstblock, total, &logflags, whichfork))) - goto error0; - } - if (wr && *firstblock == NULLFSBLOCK) { - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) - minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1; - else - minleft = 1; - } else - minleft = 0; - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - goto error0; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - n = 0; - end = bno + len; - obno = bno; - bma.ip = NULL; - if (delta) { - delta->xed_startoff = NULLFILEOFF; - delta->xed_blockcount = 0; - } - while (bno < end && n < *nmap) { - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof && !wr) - got.br_startoff = end; - inhole = eof || got.br_startoff > bno; - wasdelay = wr && !inhole && !(flags & XFS_BMAPI_DELAY) && - ISNULLSTARTBLOCK(got.br_startblock); - /* - * First, deal with the hole before the allocated space - * that we found, if any. - */ - if (wr && (inhole || wasdelay)) { - /* - * For the wasdelay case, we could also just - * allocate the stuff asked for in this bmap call - * but that wouldn't be as good. - */ - if (wasdelay && !(flags & XFS_BMAPI_EXACT)) { - alen = (xfs_extlen_t)got.br_blockcount; - aoff = got.br_startoff; - if (lastx != NULLEXTNUM && lastx) { - ep = xfs_iext_get_ext(ifp, lastx - 1); - xfs_bmbt_get_all(ep, &prev); - } - } else if (wasdelay) { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, - (got.br_startoff + - got.br_blockcount) - bno); - aoff = bno; - } else { - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(len, MAXEXTLEN); - if (!eof) - alen = (xfs_extlen_t) - XFS_FILBLKS_MIN(alen, - got.br_startoff - bno); - aoff = bno; - } - minlen = (flags & XFS_BMAPI_CONTIG) ? alen : 1; - if (flags & XFS_BMAPI_DELAY) { - xfs_extlen_t extsz; - - /* Figure out the extent size, adjust alen */ - if (rt) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - if (extsz) { - error = xfs_bmap_extsize_align(mp, - &got, &prev, extsz, - rt, eof, - flags&XFS_BMAPI_DELAY, - flags&XFS_BMAPI_CONVERT, - &aoff, &alen); - ASSERT(!error); - } - - if (rt) - extsz = alen / mp->m_sb.sb_rextsize; - - /* - * Make a transaction-less quota reservation for - * delayed allocation blocks. This number gets - * adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS( - mp, NULL, ip, (long)alen, 0, - rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS))) { - if (n == 0) { - *nmap = 0; - ASSERT(cur == NULL); - return error; - } - break; - } - - /* - * Split changing sb for alen and indlen since - * they could be coming from different places. - */ - indlen = (xfs_extlen_t) - xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - if (rt) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - -(extsz), (flags & - XFS_BMAPI_RSVBLOCKS)); - } else { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FDBLOCKS, - -(alen), (flags & - XFS_BMAPI_RSVBLOCKS)); - } - if (!error) { - error = xfs_mod_incore_sb(mp, - XFS_SBS_FDBLOCKS, - -(indlen), (flags & - XFS_BMAPI_RSVBLOCKS)); - if (error && rt) - xfs_mod_incore_sb(mp, - XFS_SBS_FREXTENTS, - extsz, (flags & - XFS_BMAPI_RSVBLOCKS)); - else if (error) - xfs_mod_incore_sb(mp, - XFS_SBS_FDBLOCKS, - alen, (flags & - XFS_BMAPI_RSVBLOCKS)); - } - - if (error) { - if (XFS_IS_QUOTA_ON(mp)) - /* unreserve the blocks now */ - (void) - XFS_TRANS_UNRESERVE_QUOTA_NBLKS( - mp, NULL, ip, - (long)alen, 0, rt ? - XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS); - break; - } - - ip->i_delayed_blks += alen; - abno = NULLSTARTBLOCK(indlen); - } else { - /* - * If first time, allocate and fill in - * once-only bma fields. - */ - if (bma.ip == NULL) { - bma.tp = tp; - bma.ip = ip; - bma.prevp = &prev; - bma.gotp = &got; - bma.total = total; - bma.userdata = 0; - } - /* Indicate if this is the first user data - * in the file, or just any user data. - */ - if (!(flags & XFS_BMAPI_METADATA)) { - bma.userdata = (aoff == 0) ? - XFS_ALLOC_INITIAL_USER_DATA : - XFS_ALLOC_USERDATA; - } - /* - * Fill in changeable bma fields. - */ - bma.eof = eof; - bma.firstblock = *firstblock; - bma.alen = alen; - bma.off = aoff; - bma.conv = !!(flags & XFS_BMAPI_CONVERT); - bma.wasdel = wasdelay; - bma.minlen = minlen; - bma.low = flist->xbf_low; - bma.minleft = minleft; - /* - * Only want to do the alignment at the - * eof if it is userdata and allocation length - * is larger than a stripe unit. - */ - if (mp->m_dalign && alen >= mp->m_dalign && - (!(flags & XFS_BMAPI_METADATA)) && - (whichfork == XFS_DATA_FORK)) { - if ((error = xfs_bmap_isaeof(ip, aoff, - whichfork, &bma.aeof))) - goto error0; - } else - bma.aeof = 0; - /* - * Call allocator. - */ - if ((error = xfs_bmap_alloc(&bma))) - goto error0; - /* - * Copy out result fields. - */ - abno = bma.rval; - if ((flist->xbf_low = bma.low)) - minleft = 0; - alen = bma.alen; - aoff = bma.off; - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, bma.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, bma.firstblock))); - *firstblock = bma.firstblock; - if (cur) - cur->bc_private.b.firstblock = - *firstblock; - if (abno == NULLFSBLOCK) - break; - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_btree_init_cursor(mp, - tp, NULL, 0, XFS_BTNUM_BMAP, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - /* - * Bump the number of extents we've allocated - * in this call. - */ - nallocs++; - } - if (cur) - cur->bc_private.b.flags = - wasdelay ? XFS_BTCUR_BPRV_WASDEL : 0; - got.br_startoff = aoff; - got.br_startblock = abno; - got.br_blockcount = alen; - got.br_state = XFS_EXT_NORM; /* assume normal */ - /* - * Determine state of extent, and the filesystem. - * A wasdelay extent has been initialized, so - * shouldn't be flagged as unwritten. - */ - if (wr && XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { - if (!wasdelay && (flags & XFS_BMAPI_PREALLOC)) - got.br_state = XFS_EXT_UNWRITTEN; - } - error = xfs_bmap_add_extent(ip, lastx, &cur, &got, - firstblock, flist, &tmp_logflags, delta, - whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); - logflags |= tmp_logflags; - if (error) - goto error0; - lastx = ifp->if_lastex; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - ASSERT(got.br_startoff <= aoff); - ASSERT(got.br_startoff + got.br_blockcount >= - aoff + alen); -#ifdef DEBUG - if (flags & XFS_BMAPI_DELAY) { - ASSERT(ISNULLSTARTBLOCK(got.br_startblock)); - ASSERT(STARTBLOCKVAL(got.br_startblock) > 0); - } - ASSERT(got.br_state == XFS_EXT_NORM || - got.br_state == XFS_EXT_UNWRITTEN); -#endif - /* - * Fall down into the found allocated space case. - */ - } else if (inhole) { - /* - * Reading in a hole. - */ - mval->br_startoff = bno; - mval->br_startblock = HOLESTARTBLOCK; - mval->br_blockcount = - XFS_FILBLKS_MIN(len, got.br_startoff - bno); - mval->br_state = XFS_EXT_NORM; - bno += mval->br_blockcount; - len -= mval->br_blockcount; - mval++; - n++; - continue; - } - /* - * Then deal with the allocated space we found. - */ - ASSERT(ep != NULL); - if (!(flags & XFS_BMAPI_ENTIRE) && - (got.br_startoff + got.br_blockcount > obno)) { - if (obno > bno) - bno = obno; - ASSERT((bno >= obno) || (n == 0)); - ASSERT(bno < end); - mval->br_startoff = bno; - if (ISNULLSTARTBLOCK(got.br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } else - mval->br_startblock = - got.br_startblock + - (bno - got.br_startoff); - /* - * Return the minimum of what we got and what we - * asked for for the length. We can use the len - * variable here because it is modified below - * and we could have been there before coming - * here if the first part of the allocation - * didn't overlap what was asked for. - */ - mval->br_blockcount = - XFS_FILBLKS_MIN(end - bno, got.br_blockcount - - (bno - got.br_startoff)); - mval->br_state = got.br_state; - ASSERT(mval->br_blockcount <= len); - } else { - *mval = got; - if (ISNULLSTARTBLOCK(mval->br_startblock)) { - ASSERT(!wr || (flags & XFS_BMAPI_DELAY)); - mval->br_startblock = DELAYSTARTBLOCK; - } - } - - /* - * Check if writing previously allocated but - * unwritten extents. - */ - if (wr && mval->br_state == XFS_EXT_UNWRITTEN && - ((flags & (XFS_BMAPI_PREALLOC|XFS_BMAPI_DELAY)) == 0)) { - /* - * Modify (by adding) the state flag, if writing. - */ - ASSERT(mval->br_blockcount <= len); - if ((ifp->if_flags & XFS_IFBROOT) && !cur) { - cur = xfs_btree_init_cursor(mp, - tp, NULL, 0, XFS_BTNUM_BMAP, - ip, whichfork); - cur->bc_private.b.firstblock = - *firstblock; - cur->bc_private.b.flist = flist; - } - mval->br_state = XFS_EXT_NORM; - error = xfs_bmap_add_extent(ip, lastx, &cur, mval, - firstblock, flist, &tmp_logflags, delta, - whichfork, (flags & XFS_BMAPI_RSVBLOCKS)); - logflags |= tmp_logflags; - if (error) - goto error0; - lastx = ifp->if_lastex; - ep = xfs_iext_get_ext(ifp, lastx); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmbt_get_all(ep, &got); - /* - * We may have combined previously unwritten - * space with written space, so generate - * another request. - */ - if (mval->br_blockcount < len) - continue; - } - - ASSERT((flags & XFS_BMAPI_ENTIRE) || - ((mval->br_startoff + mval->br_blockcount) <= end)); - ASSERT((flags & XFS_BMAPI_ENTIRE) || - (mval->br_blockcount <= len) || - (mval->br_startoff < obno)); - bno = mval->br_startoff + mval->br_blockcount; - len = end - bno; - if (n > 0 && mval->br_startoff == mval[-1].br_startoff) { - ASSERT(mval->br_startblock == mval[-1].br_startblock); - ASSERT(mval->br_blockcount > mval[-1].br_blockcount); - ASSERT(mval->br_state == mval[-1].br_state); - mval[-1].br_blockcount = mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (n > 0 && mval->br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != DELAYSTARTBLOCK && - mval[-1].br_startblock != HOLESTARTBLOCK && - mval->br_startblock == - mval[-1].br_startblock + mval[-1].br_blockcount && - ((flags & XFS_BMAPI_IGSTATE) || - mval[-1].br_state == mval->br_state)) { - ASSERT(mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount); - mval[-1].br_blockcount += mval->br_blockcount; - } else if (n > 0 && - mval->br_startblock == DELAYSTARTBLOCK && - mval[-1].br_startblock == DELAYSTARTBLOCK && - mval->br_startoff == - mval[-1].br_startoff + mval[-1].br_blockcount) { - mval[-1].br_blockcount += mval->br_blockcount; - mval[-1].br_state = mval->br_state; - } else if (!((n == 0) && - ((mval->br_startoff + mval->br_blockcount) <= - obno))) { - mval++; - n++; - } - /* - * If we're done, stop now. Stop when we've allocated - * XFS_BMAP_MAX_NMAP extents no matter what. Otherwise - * the transaction may get too big. - */ - if (bno >= end || n >= *nmap || nallocs >= *nmap) - break; - /* - * Else go on to the next record. - */ - ep = xfs_iext_get_ext(ifp, ++lastx); - if (lastx >= nextents) { - eof = 1; - prev = got; - } else - xfs_bmbt_get_all(ep, &got); - } - ifp->if_lastex = lastx; - *nmap = n; - /* - * Transform from btree to extents, give it cur. - */ - if (tp && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { - ASSERT(wr && cur); - error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, whichfork); - logflags |= tmp_logflags; - if (error) - goto error0; - } - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); - ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); - error = 0; - if (delta && delta->xed_startoff != NULLFILEOFF) { - /* A change was actually made. - * Note that delta->xed_blockount is an offset at this - * point and needs to be converted to a block count. - */ - ASSERT(delta->xed_blockcount > delta->xed_startoff); - delta->xed_blockcount -= delta->xed_startoff; - } -error0: - /* - * Log everything. Do this after conversion, there's no point in - * logging the extent records if we've converted to btree format. - */ - if ((logflags & XFS_ILOG_FEXT(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~XFS_ILOG_FEXT(whichfork); - else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~XFS_ILOG_FBROOT(whichfork); - /* - * Log whatever the flags say, even if error. Otherwise we might miss - * detecting a case where the data is changed, there's an error, - * and it's not logged so we don't shutdown when we should. - */ - if (logflags) { - ASSERT(tp && wr); - xfs_trans_log_inode(tp, ip, logflags); - } - if (cur) { - if (!error) { - ASSERT(*firstblock == NULLFSBLOCK || - XFS_FSB_TO_AGNO(mp, *firstblock) == - XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock) || - (flist->xbf_low && - XFS_FSB_TO_AGNO(mp, *firstblock) < - XFS_FSB_TO_AGNO(mp, - cur->bc_private.b.firstblock))); - *firstblock = cur->bc_private.b.firstblock; - } - xfs_btree_del_cursor(cur, - error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - } - if (!error) - xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, - orig_nmap, *nmap); - return error; -} - -/* - * Map file blocks to filesystem blocks, simple version. - * One block (extent) only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno) /* starting file offs. mapped */ -{ - int eof; /* we've hit the end of extents */ - int error; /* error return */ - xfs_bmbt_irec_t got; /* current file extent record */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last useful extent number */ - xfs_bmbt_irec_t prev; /* previous file extent record */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - if (unlikely( - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_bmapi_single", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - XFS_STATS_INC(xs_blk_mapr); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - (void)xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - /* - * Reading past eof, act as though there's a hole - * up to end. - */ - if (eof || got.br_startoff > bno) { - *fsb = NULLFSBLOCK; - return 0; - } - ASSERT(!ISNULLSTARTBLOCK(got.br_startblock)); - ASSERT(bno < got.br_startoff + got.br_blockcount); - *fsb = got.br_startblock + (bno - got.br_startoff); - ifp->if_lastex = lastx; - return 0; -} - -/* * Unmap (remove) blocks from a file. * If nexts is nonzero then the number of extents to remove is limited to * that value. If not all extents in the block range can be removed then @@ -5373,14 +5043,12 @@ xfs_bunmapi( xfs_fsblock_t *firstblock, /* first allocated block controls a.g. for allocs */ xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta, /* o: change made to incore - extents */ int *done) /* set if not done yet */ { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ int eof; /* is deleting at eof */ - xfs_bmbt_rec_t *ep; /* extent record pointer */ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5396,10 +5064,10 @@ xfs_bunmapi( int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ int whichfork; /* data or attribute fork */ - int rsvd; /* OK to allocate reserved blocks */ xfs_fsblock_t sum; - xfs_bunmap_trace(ip, bno, len, flags, (inst_t *)__return_address); + trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); + whichfork = (flags & XFS_BMAPI_ATTRFORK) ? XFS_ATTR_FORK : XFS_DATA_FORK; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -5413,11 +5081,11 @@ xfs_bunmapi( mp = ip->i_mount; if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); - rsvd = (flags & XFS_BMAPI_RSVBLOCKS) != 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(len > 0); ASSERT(nexts >= 0); - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; @@ -5432,10 +5100,7 @@ xfs_bunmapi( bno = start + len - 1; ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); - if (delta) { - delta->xed_startoff = NULLFILEOFF; - delta->xed_blockcount = 0; - } + /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... @@ -5448,13 +5113,21 @@ xfs_bunmapi( logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); - cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip, - whichfork); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; cur->bc_private.b.flist = flist; cur->bc_private.b.flags = 0; } else cur = NULL; + + if (isrt) { + /* + * Synchronize by locking the bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); + } + extno = 0; while (bno != (xfs_fileoff_t)-1 && bno >= start && lastx >= 0 && (nexts == 0 || extno < nexts)) { @@ -5482,7 +5155,7 @@ xfs_bunmapi( */ ASSERT(ep != NULL); del = got; - wasdel = ISNULLSTARTBLOCK(del.br_startblock); + wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; @@ -5502,7 +5175,7 @@ xfs_bunmapi( * get rid of part of a realtime extent. */ if (del.br_state == XFS_EXT_UNWRITTEN || - !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { /* * This piece is unwritten, or we're not * using unwritten extents. Skip over it. @@ -5533,9 +5206,9 @@ xfs_bunmapi( del.br_blockcount = mod; } del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx, &cur, &del, - firstblock, flist, &logflags, delta, - XFS_DATA_FORK, 0); + error = xfs_bmap_add_extent_unwritten_real(tp, ip, + &lastx, &cur, &del, firstblock, flist, + &logflags); if (error) goto error0; goto nodelete; @@ -5554,16 +5227,19 @@ xfs_bunmapi( } else if ((del.br_startoff == start && (del.br_state == XFS_EXT_UNWRITTEN || xfs_trans_get_block_res(tp) == 0)) || - !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { + !xfs_sb_version_hasextflgbit(&mp->m_sb)) { /* * Can't make it unwritten. There isn't * a full extent here so just skip it. */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (bno < got.br_startoff) { - if (--lastx >= 0) - xfs_bmbt_get_all(--ep, &got); + if (got.br_startoff > bno) { + if (--lastx >= 0) { + ep = xfs_iext_get_ext(ifp, + lastx); + xfs_bmbt_get_all(ep, &got); + } } continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { @@ -5577,7 +5253,7 @@ xfs_bunmapi( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), &prev); ASSERT(prev.br_state == XFS_EXT_NORM); - ASSERT(!ISNULLSTARTBLOCK(prev.br_startblock)); + ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == prev.br_startblock + prev.br_blockcount); if (prev.br_startoff < start) { @@ -5587,25 +5263,26 @@ xfs_bunmapi( prev.br_startoff = start; } prev.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx - 1, &cur, - &prev, firstblock, flist, &logflags, - delta, XFS_DATA_FORK, 0); + lastx--; + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &prev, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } else { ASSERT(del.br_state == XFS_EXT_NORM); del.br_state = XFS_EXT_UNWRITTEN; - error = xfs_bmap_add_extent(ip, lastx, &cur, - &del, firstblock, flist, &logflags, - delta, XFS_DATA_FORK, 0); + error = xfs_bmap_add_extent_unwritten_real(tp, + ip, &lastx, &cur, &del, + firstblock, flist, &logflags); if (error) goto error0; goto nodelete; } } if (wasdel) { - ASSERT(STARTBLOCKVAL(del.br_startblock) > 0); + ASSERT(startblockval(del.br_startblock) > 0); /* Update realtime/data freespace, unreserve quota */ if (isrt) { xfs_filblks_t rtexts; @@ -5613,15 +5290,15 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int)rtexts, rsvd); - (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, - NULL, ip, -((long)del.br_blockcount), 0, + (int64_t)rtexts, 0); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, - (int)del.br_blockcount, rsvd); - (void)XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, - NULL, ip, -((long)del.br_blockcount), 0, + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + (int64_t)del.br_blockcount, 0); + (void)xfs_trans_reserve_quota_nblks(NULL, + ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); } ip->i_delayed_blks -= del.br_blockcount; @@ -5643,46 +5320,43 @@ xfs_bunmapi( */ if (!wasdel && xfs_trans_get_block_res(tp) == 0 && XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && + XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */ + XFS_IFORK_MAXEXT(ip, whichfork) && del.br_startoff > got.br_startoff && del.br_startoff + del.br_blockcount < got.br_startoff + got.br_blockcount) { error = XFS_ERROR(ENOSPC); goto error0; } - error = xfs_bmap_del_extent(ip, tp, lastx, flist, cur, &del, - &tmp_logflags, delta, whichfork, rsvd); + error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, + &tmp_logflags, whichfork); logflags |= tmp_logflags; if (error) goto error0; bno = del.br_startoff - 1; nodelete: - lastx = ifp->if_lastex; /* * If not done go on to the next (previous) record. - * Reset ep in case the extents array was re-alloced. */ - ep = xfs_iext_get_ext(ifp, lastx); if (bno != (xfs_fileoff_t)-1 && bno >= start) { - if (lastx >= XFS_IFORK_NEXTENTS(ip, whichfork) || - xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, lastx); - } - if (lastx >= 0) + if (lastx >= 0) { + ep = xfs_iext_get_ext(ifp, lastx); + if (xfs_bmbt_get_startoff(ep) > bno) { + if (--lastx >= 0) + ep = xfs_iext_get_ext(ifp, + lastx); + } xfs_bmbt_get_all(ep, &got); + } extno++; } } - ifp->if_lastex = lastx; *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); + /* * Convert to a btree if necessary. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) { + if (xfs_bmap_needs_btree(ip, whichfork)) { ASSERT(cur == NULL); error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, &cur, 0, &tmp_logflags, whichfork); @@ -5693,8 +5367,7 @@ nodelete: /* * transform from btree to extents, give it cur */ - else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && - XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) { + else if (xfs_bmap_wants_extents(ip, whichfork)) { ASSERT(cur != NULL); error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, whichfork); @@ -5705,28 +5378,18 @@ nodelete: /* * transform from extents to local? */ - ASSERT(ifp->if_ext_max == - XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t)); error = 0; - if (delta && delta->xed_startoff != NULLFILEOFF) { - /* A change was actually made. - * Note that delta->xed_blockount is an offset at this - * point and needs to be converted to a block count. - */ - ASSERT(delta->xed_blockcount > delta->xed_startoff); - delta->xed_blockcount -= delta->xed_startoff; - } error0: /* * Log everything. Do this after conversion, there's no point in * logging the extent records if we've converted to btree format. */ - if ((logflags & XFS_ILOG_FEXT(whichfork)) && + if ((logflags & xfs_ilog_fext(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) - logflags &= ~XFS_ILOG_FEXT(whichfork); - else if ((logflags & XFS_ILOG_FBROOT(whichfork)) && + logflags &= ~xfs_ilog_fext(whichfork); + else if ((logflags & xfs_ilog_fbroot(whichfork)) && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) - logflags &= ~XFS_ILOG_FBROOT(whichfork); + logflags &= ~xfs_ilog_fbroot(whichfork); /* * Log inode even in the error case, if the transaction * is dirty we'll need to shut down the filesystem. @@ -5745,788 +5408,199 @@ error0: } /* - * Fcntl interface to xfs_bmapi. + * Shift extent records to the left to cover a hole. + * + * The maximum number of extents to be shifted in a single operation + * is @num_exts, and @current_ext keeps track of the current extent + * index we have shifted. @offset_shift_fsb is the length by which each + * extent is shifted. If there is no hole to shift the extents + * into, this will be considered invalid operation and we abort immediately. */ -int /* error code */ -xfs_getbmap( - bhv_desc_t *bdp, /* XFS behavior descriptor*/ - struct getbmap *bmv, /* user bmap structure */ - void __user *ap, /* pointer to user's array */ - int interface) /* interface flags */ +int +xfs_bmap_shift_extents( + struct xfs_trans *tp, + struct xfs_inode *ip, + int *done, + xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, + xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, + int num_exts) { - __int64_t bmvend; /* last block requested */ - int error; /* return value */ - __int64_t fixlen; /* length for -1 case */ - int i; /* extent number */ - xfs_inode_t *ip; /* xfs incore inode pointer */ - bhv_vnode_t *vp; /* corresponding vnode */ - int lock; /* lock state */ - xfs_bmbt_irec_t *map; /* buffer for user's data */ - xfs_mount_t *mp; /* file system mount point */ - int nex; /* # of user extents can do */ - int nexleft; /* # of user extents left */ - int subnex; /* # of bmapi's can do */ - int nmap; /* number of map entries */ - struct getbmap out; /* output structure */ - int whichfork; /* data or attr fork */ - int prealloced; /* this is a file with - * preallocated data space */ - int sh_unwritten; /* true, if unwritten */ - /* extents listed separately */ - int bmapi_flags; /* flags for xfs_bmapi */ - __int32_t oflags; /* getbmapx bmv_oflags field */ - - vp = BHV_TO_VNODE(bdp); - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; - sh_unwritten = (interface & BMV_IF_PREALLOC) != 0; - - /* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not - * generate a DMAPI read event. Otherwise, if the DM_EVENT_READ - * bit is set for the file, generate a read event in order - * that the DMAPI application may do its thing before we return - * the extents. Usually this means restoring user file data to - * regions of the file that look like holes. - * - * The "old behavior" (from XFS_IOC_GETBMAP) is to not specify - * BMV_IF_NO_DMAPI_READ so that read events are generated. - * If this were not true, callers of ioctl( XFS_IOC_GETBMAP ) - * could misinterpret holes in a DMAPI file as true holes, - * when in fact they may represent offline user data. - */ - if ( (interface & BMV_IF_NO_DMAPI_READ) == 0 - && DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) - && whichfork == XFS_DATA_FORK) { - - error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); - if (error) - return XFS_ERROR(error); - } - - if (whichfork == XFS_ATTR_FORK) { - if (XFS_IFORK_Q(ip)) { - if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && - ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - } else if (unlikely( - ip->i_d.di_aformat != 0 && - ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { - XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - } else if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && - ip->i_d.di_format != XFS_DINODE_FMT_BTREE && - ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) - return XFS_ERROR(EINVAL); - if (whichfork == XFS_DATA_FORK) { - if ((ip->i_d.di_extsize && (ip->i_d.di_flags & - (XFS_DIFLAG_REALTIME|XFS_DIFLAG_EXTSIZE))) || - ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ - prealloced = 1; - fixlen = XFS_MAXIOFFSET(mp); - } else { - prealloced = 0; - fixlen = ip->i_d.di_size; - } - } else { - prealloced = 0; - fixlen = 1LL << 32; - } - - if (bmv->bmv_length == -1) { - fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); - bmv->bmv_length = MAX( (__int64_t)(fixlen - bmv->bmv_offset), - (__int64_t)0); - } else if (bmv->bmv_length < 0) - return XFS_ERROR(EINVAL); - if (bmv->bmv_length == 0) { - bmv->bmv_entries = 0; - return 0; - } - nex = bmv->bmv_count - 1; - if (nex <= 0) - return XFS_ERROR(EINVAL); - bmvend = bmv->bmv_offset + bmv->bmv_length; - - xfs_ilock(ip, XFS_IOLOCK_SHARED); - - if (whichfork == XFS_DATA_FORK && ip->i_delayed_blks) { - /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ - error = bhv_vop_flush_pages(vp, (xfs_off_t)0, -1, 0, FI_REMAPF); - } - - ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); - - lock = xfs_ilock_map_shared(ip); - - /* - * Don't let nex be bigger than the number of extents - * we can have assuming alternating holes and real extents. - */ - if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) - nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; - - bmapi_flags = XFS_BMAPI_AFLAG(whichfork) | - ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE); - - /* - * Allocate enough space to handle "subnex" maps at a time. - */ - subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_SLEEP); + struct xfs_btree_cur *cur; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_extnum_t nexts = 0; + xfs_fileoff_t startoff; + int error = 0; + int i; + int whichfork = XFS_DATA_FORK; + int logflags; + xfs_filblks_t blockcount = 0; + int total_extents; - bmv->bmv_entries = 0; - - if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) { - error = 0; - goto unlock_and_return; + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_shift_extents", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); } - nexleft = nex; - - do { - nmap = (nexleft > subnex) ? subnex : nexleft; - error = xfs_bmapi(NULL, ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), - XFS_BB_TO_FSB(mp, bmv->bmv_length), - bmapi_flags, NULL, 0, map, &nmap, - NULL, NULL); - if (error) - goto unlock_and_return; - ASSERT(nmap <= subnex); - - for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { - nexleft--; - oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ? - BMV_OF_PREALLOC : 0; - out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff); - out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount); - ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); - if (map[i].br_startblock == HOLESTARTBLOCK && - ((prealloced && out.bmv_offset + out.bmv_length == bmvend) || - whichfork == XFS_ATTR_FORK )) { - /* - * came to hole at end of file or the end of - attribute fork - */ - goto unlock_and_return; - } else { - out.bmv_block = - (map[i].br_startblock == HOLESTARTBLOCK) ? - -1 : - XFS_FSB_TO_DB(ip, map[i].br_startblock); - - /* return either getbmap/getbmapx structure. */ - if (interface & BMV_IF_EXTENDED) { - struct getbmapx outx; - - GETBMAP_CONVERT(out,outx); - outx.bmv_oflags = oflags; - outx.bmv_unused1 = outx.bmv_unused2 = 0; - if (copy_to_user(ap, &outx, - sizeof(outx))) { - error = XFS_ERROR(EFAULT); - goto unlock_and_return; - } - } else { - if (copy_to_user(ap, &out, - sizeof(out))) { - error = XFS_ERROR(EFAULT); - goto unlock_and_return; - } - } - bmv->bmv_offset = - out.bmv_offset + out.bmv_length; - bmv->bmv_length = MAX((__int64_t)0, - (__int64_t)(bmvend - bmv->bmv_offset)); - bmv->bmv_entries++; - ap = (interface & BMV_IF_EXTENDED) ? - (void __user *) - ((struct getbmapx __user *)ap + 1) : - (void __user *) - ((struct getbmap __user *)ap + 1); - } - } - } while (nmap && nexleft && bmv->bmv_length); - -unlock_and_return: - xfs_iunlock_map_shared(ip, lock); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - - kmem_free(map, subnex * sizeof(*map)); - - return error; -} + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); -/* - * Check the last inode extent to determine whether this allocation will result - * in blocks being allocated at the end of the file. When we allocate new data - * blocks at the end of the file which do not start at the previous data block, - * we will try to align the new blocks at stripe unit boundaries. - */ -STATIC int /* error */ -xfs_bmap_isaeof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t off, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - char *aeof) /* return value */ -{ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t s; /* expanded extent record */ + ASSERT(current_ext != NULL); - ASSERT(whichfork == XFS_DATA_FORK); ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *aeof = 1; - return 0; + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; } - /* - * Go to the last extent - */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - xfs_bmbt_get_all(lastrec, &s); - /* - * Check we are allocating in the last extent (for delayed allocations) - * or past the last extent for non-delayed allocations. - */ - *aeof = (off >= s.br_startoff && - off < s.br_startoff + s.br_blockcount && - ISNULLSTARTBLOCK(s.br_startblock)) || - off >= s.br_startoff + s.br_blockcount; - return 0; -} - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary. - */ -int /* error */ -xfs_bmap_eof( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t endoff, /* file offset in fsblocks */ - int whichfork, /* data or attribute fork */ - int *eof) /* result value */ -{ - xfs_fsblock_t blockcount; /* extent block count */ - int error; /* error return value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_t *lastrec; /* extent record pointer */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff; /* extent starting file offset */ - ASSERT(whichfork == XFS_DATA_FORK); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(NULL, ip, whichfork))) - return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *eof = 1; - return 0; - } /* - * Go to the last extent + * If *current_ext is 0, we would need to lookup the extent + * from where we would start shifting and store it in gotp. */ - lastrec = xfs_iext_get_ext(ifp, nextents - 1); - startoff = xfs_bmbt_get_startoff(lastrec); - blockcount = xfs_bmbt_get_blockcount(lastrec); - *eof = endoff >= startoff + blockcount; - return 0; -} - -#ifdef DEBUG -/* - * Check that the extents list for the inode ip is in the right order. - */ -STATIC void -xfs_bmap_check_extents( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_rec_t *ep; /* current extent entry */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extents in list */ - xfs_bmbt_rec_t *nextp; /* next extent entry */ - - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ep = xfs_iext_get_ext(ifp, 0); - for (idx = 0; idx < nextents - 1; idx++) { - nextp = xfs_iext_get_ext(ifp, idx + 1); - xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, - (void *)(nextp)); - ep = nextp; - } -} - -STATIC -xfs_buf_t * -xfs_bmap_get_bp( - xfs_btree_cur_t *cur, - xfs_fsblock_t bno) -{ - int i; - xfs_buf_t *bp; - - if (!cur) - return(NULL); - - bp = NULL; - for(i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - bp = cur->bc_bufs[i]; - if (!bp) break; - if (XFS_BUF_ADDR(bp) == bno) - break; /* Found it */ - } - if (i == XFS_BTREE_MAXLEVELS) - bp = NULL; - - if (!bp) { /* Chase down all the log items to see if the bp is there */ - xfs_log_item_chunk_t *licp; - xfs_trans_t *tp; - - tp = cur->bc_tp; - licp = &tp->t_items; - while (!bp && licp != NULL) { - if (XFS_LIC_ARE_ALL_FREE(licp)) { - licp = licp->lic_next; - continue; - } - for (i = 0; i < licp->lic_unused; i++) { - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_buf_log_item_t *bip; - xfs_buf_t *lbp; - - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - - lidp = XFS_LIC_SLOT(licp, i); - lip = lidp->lid_item; - if (lip->li_type != XFS_LI_BUF) - continue; - - bip = (xfs_buf_log_item_t *)lip; - lbp = bip->bli_buf; - - if (XFS_BUF_ADDR(lbp) == bno) { - bp = lbp; - break; /* Found it */ - } - } - licp = licp->lic_next; - } - } - return(bp); -} - -void -xfs_check_block( - xfs_bmbt_block_t *block, - xfs_mount_t *mp, - int root, - short sz) -{ - int i, j, dmxr; - xfs_bmbt_ptr_t *pp, *thispa; /* pointer to block address */ - xfs_bmbt_key_t *prevp, *keyp; - - ASSERT(be16_to_cpu(block->bb_level) > 0); - - prevp = NULL; - for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) { - dmxr = mp->m_bmap_dmxr[0]; - - if (root) { - keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz); - } else { - keyp = XFS_BTREE_KEY_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, i, dmxr); - } - - if (prevp) { - xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp); - } - prevp = keyp; - + if (!*current_ext) { + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); /* - * Compare the block numbers to see if there are dups. + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) start_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. */ - - if (root) { - pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz); - } else { - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, i, dmxr); - } - for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) { - if (root) { - thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz); - } else { - thispa = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, j, dmxr); - } - if (INT_GET(*thispa, ARCH_CONVERT) == - INT_GET(*pp, ARCH_CONVERT)) { - cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", - __FUNCTION__, j, i, - INT_GET(*thispa, ARCH_CONVERT)); - panic("%s: ptrs are equal in node\n", - __FUNCTION__); - } + if (!gotp) { + *done = 1; + return 0; } } -} - -/* - * Check that the extents for the inode ip are in the right order in all - * btree leaves. - */ - -STATIC void -xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork) /* data or attr fork */ -{ - xfs_bmbt_block_t *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_buf_t *bp; /* buffer for "block" */ - int error; /* error return value */ - xfs_extnum_t i=0, j; /* index into the extents list */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ - xfs_bmbt_rec_t *ep; /* pointer to current extent */ - xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ - xfs_bmbt_rec_t *nextp; /* pointer to next extent */ - int bp_release = 0; - - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE) { - return; - } - - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - block = ifp->if_broot; - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - xfs_check_block(block, mp, 1, ifp->if_broot_bytes); - pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); - /* - * Go down the tree until leaf level is reached, following the first - * pointer (leftmost) at each level. - */ - while (level-- > 0) { - /* See if buf is in cur first */ - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { - bp_release = 1; - } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - XFS_BMAP_SANITY_CHECK(mp, block, level), - error0); - if (level == 0) - break; - /* - * Check this block for basic sanity (increasing keys and - * no duplicate blocks). - */ - - xfs_check_block(block, mp, 0, 0); - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, block, - 1, mp->m_bmap_dmxr[1]); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, INT_GET(*pp, ARCH_CONVERT)), error0); - bno = INT_GET(*pp, ARCH_CONVERT); - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } + /* We are going to change core inode */ + logflags = XFS_ILOG_CORE; + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstblock; + cur->bc_private.b.flist = flist; + cur->bc_private.b.flags = 0; + } else { + cur = NULL; + logflags |= XFS_ILOG_DEXT; } /* - * Here with bp and block set to the leftmost leaf node in the tree. - */ - i = 0; - - /* - * Loop over all leaf nodes checking that all extents are in the right order. + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot + * use the count of real extents here. Instead we have to calculate it + * from the incore fork. */ - lastp = NULL; - for (;;) { - xfs_fsblock_t nextbno; - xfs_extnum_t num_recs; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + while (nexts++ < num_exts && *current_ext < total_extents) { - - num_recs = be16_to_cpu(block->bb_numrecs); + gotp = xfs_iext_get_ext(ifp, *current_ext); + xfs_bmbt_get_all(gotp, &got); + startoff = got.br_startoff - offset_shift_fsb; /* - * Read-ahead the next leaf block, if any. + * Before shifting extent into hole, make sure that the hole + * is large enough to accomodate the shift. */ + if (*current_ext) { + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, + *current_ext - 1), &left); - nextbno = be64_to_cpu(block->bb_rightsib); - - /* - * Check all the extents to make sure they are OK. - * If we had a previous block, the last entry should - * conform with the first entry in this one. - */ - - ep = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, 1, mp->m_bmap_dmxr[0]); - for (j = 1; j < num_recs; j++) { - nextp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_bmbt, - block, j + 1, mp->m_bmap_dmxr[0]); - if (lastp) { - xfs_btree_check_rec(XFS_BTNUM_BMAP, - (void *)lastp, (void *)ep); - } - xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep, - (void *)(nextp)); - lastp = ep; - ep = nextp; + if (startoff < left.br_startoff + left.br_blockcount) + error = XFS_ERROR(EINVAL); + } else if (offset_shift_fsb > got.br_startoff) { + /* + * When first extent is shifted, offset_shift_fsb + * should be less than the stating offset of + * the first extent. + */ + error = XFS_ERROR(EINVAL); } - i += num_recs; - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - bno = nextbno; - /* - * If we've reached the end, stop. - */ - if (bno == NULLFSBLOCK) - break; + if (error) + goto del_cursor; - bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); - if (bp) { - bp_release = 0; - } else { - bp_release = 1; + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); } - if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - goto error_norelse; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - } - if (bp_release) { - bp_release = 0; - xfs_trans_brelse(NULL, bp); - } - return; -error0: - cmn_err(CE_WARN, "%s: at error0", __FUNCTION__); - if (bp_release) - xfs_trans_brelse(NULL, bp); -error_norelse: - cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", - __FUNCTION__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__); - return; -} -#endif + /* Check if we can merge 2 adjacent extents */ + if (*current_ext && + left.br_startoff + left.br_blockcount == startoff && + left.br_startblock + left.br_blockcount == + got.br_startblock && + left.br_state == got.br_state && + left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { + blockcount = left.br_blockcount + + got.br_blockcount; + xfs_iext_remove(ip, *current_ext, 1, 0); + if (cur) { + error = xfs_btree_delete(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + gotp = xfs_iext_get_ext(ifp, --*current_ext); + xfs_bmbt_get_all(gotp, &got); -/* - * Count fsblocks of the given fork. - */ -int /* error */ -xfs_bmap_count_blocks( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - int *count) /* out: count of blocks */ -{ - xfs_bmbt_block_t *block; /* current btree block */ - xfs_fsblock_t bno; /* block # of "block" */ - xfs_ifork_t *ifp; /* fork structure */ - int level; /* btree level, for checking */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_bmbt_ptr_t *pp; /* pointer to block address */ + /* Make cursor point to the extent we will update */ + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); + } - bno = NULLFSBLOCK; - mp = ip->i_mount; - ifp = XFS_IFORK_PTR(ip, whichfork); - if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - if (unlikely(xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); + xfs_bmbt_set_blockcount(gotp, blockcount); + got.br_blockcount = blockcount; + } else { + /* We have to update the startoff */ + xfs_bmbt_set_startoff(gotp, startoff); + got.br_startoff = startoff; } - return 0; - } - - /* - * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. - */ - block = ifp->if_broot; - level = be16_to_cpu(block->bb_level); - ASSERT(level > 0); - pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes); - ASSERT(INT_GET(*pp, ARCH_CONVERT) != NULLDFSBNO); - ASSERT(XFS_FSB_TO_AGNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agcount); - ASSERT(XFS_FSB_TO_AGBNO(mp, INT_GET(*pp, ARCH_CONVERT)) < mp->m_sb.sb_agblocks); - bno = INT_GET(*pp, ARCH_CONVERT); - - if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { - XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } - return 0; -} - -/* - * Recursively walks each level of a btree - * to count total fsblocks is use. - */ -int /* error */ -xfs_bmap_count_tree( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fsblock_t blockno, /* file system block number */ - int levelin, /* level in btree */ - int *count) /* Count of blocks */ -{ - int error; - xfs_buf_t *bp, *nbp; - int level = levelin; - xfs_bmbt_ptr_t *pp; - xfs_fsblock_t bno = blockno; - xfs_fsblock_t nextbno; - xfs_bmbt_block_t *block, *nextblock; - int numrecs; - - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) - return error; - *count += 1; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - - if (--level) { - /* Not at node above leafs, count this level of nodes */ - nextbno = be64_to_cpu(block->bb_rightsib); - while (nextbno != NULLFSBLOCK) { - if ((error = xfs_btree_read_bufl(mp, tp, nextbno, - 0, &nbp, XFS_BMAP_BTREE_REF))) - return error; - *count += 1; - nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp); - nextbno = be64_to_cpu(nextblock->bb_rightsib); - xfs_trans_brelse(tp, nbp); + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; } - /* Dive to the next level */ - pp = XFS_BTREE_PTR_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]); - bno = INT_GET(*pp, ARCH_CONVERT); - if (unlikely((error = - xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - } else { - /* count all level 1 nodes and their leaves */ - for (;;) { - nextbno = be64_to_cpu(block->bb_rightsib); - numrecs = be16_to_cpu(block->bb_numrecs); - if (unlikely(xfs_bmap_disk_count_leaves(ifp, mp, - 0, block, numrecs, count) < 0)) { - xfs_trans_brelse(tp, bp); - XFS_ERROR_REPORT("xfs_bmap_count_tree(2)", - XFS_ERRLEVEL_LOW, mp); - return XFS_ERROR(EFSCORRUPTED); - } - xfs_trans_brelse(tp, bp); - if (nextbno == NULLFSBLOCK) - break; - bno = nextbno; - if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, - XFS_BMAP_BTREE_REF))) - return error; - *count += 1; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - } + (*current_ext)++; + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); } - return 0; -} -/* - * Count leaf blocks given a range of extent records. - */ -int -xfs_bmap_count_leaves( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; - - for (b = 0; b < numrecs; b++) { - frp = xfs_iext_get_ext(ifp, idx + b); - *count += xfs_bmbt_get_blockcount(frp); - } - return 0; -} + /* Check if we are done */ + if (*current_ext == total_extents) + *done = 1; -/* - * Count leaf blocks given a range of extent records originally - * in btree format. - */ -int -xfs_bmap_disk_count_leaves( - xfs_ifork_t *ifp, - xfs_mount_t *mp, - xfs_extnum_t idx, - xfs_bmbt_block_t *block, - int numrecs, - int *count) -{ - int b; - xfs_bmbt_rec_t *frp; +del_cursor: + if (cur) + xfs_btree_del_cursor(cur, + error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); - for (b = 1; b <= numrecs; b++) { - frp = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, - xfs_bmbt, block, idx + b, mp->m_bmap_dmxr[0]); - *count += xfs_bmbt_disk_get_blockcount(frp); - } - return 0; + xfs_trans_log_inode(tp, ip, logflags); + return error; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 80e93409b78..b879ca56a64 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -25,19 +25,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; -/* - * DELTA: describe a change to the in-core extent list. - * - * Internally the use of xed_blockount is somewhat funky. - * xed_blockcount contains an offset much of the time because this - * makes merging changes easier. (xfs_fileoff_t and xfs_filblks_t are - * the same underlying type). - */ -typedef struct xfs_extdelta -{ - xfs_fileoff_t xed_startoff; /* offset of range */ - xfs_filblks_t xed_blockcount; /* blocks in range */ -} xfs_extdelta_t; +extern kmem_zone_t *xfs_bmap_free_item_zone; /* * List of extents to be free "later". @@ -52,37 +40,54 @@ typedef struct xfs_bmap_free_item /* * Header for free extent list. + * + * xbf_low is used by the allocator to activate the lowspace algorithm - + * when free space is running low the extent allocator may choose to + * allocate an extent from an AG without leaving sufficient space for + * a btree split when inserting the new extent. In this case the allocator + * will enable the lowspace algorithm which is supposed to allow further + * allocations (such as btree splits and newroots) to allocate from + * sequential AGs. In order to avoid locking AGs out of order the lowspace + * algorithm will start searching for free space from AG 0. If the correct + * transaction reservations have been made then this algorithm will eventually + * find all the space it needs. */ typedef struct xfs_bmap_free { xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ int xbf_count; /* count of items on list */ - int xbf_low; /* kludge: alloc in low mode */ + int xbf_low; /* alloc in low mode */ } xfs_bmap_free_t; #define XFS_BMAP_MAX_NMAP 4 /* - * Flags for xfs_bmapi + * Flags for xfs_bmapi_* */ -#define XFS_BMAPI_WRITE 0x001 /* write operation: allocate space */ -#define XFS_BMAPI_DELAY 0x002 /* delayed write operation */ -#define XFS_BMAPI_ENTIRE 0x004 /* return entire extent, not trimmed */ -#define XFS_BMAPI_METADATA 0x008 /* mapping metadata not user data */ -#define XFS_BMAPI_EXACT 0x010 /* allocate only to spec'd bounds */ -#define XFS_BMAPI_ATTRFORK 0x020 /* use attribute fork not data */ -#define XFS_BMAPI_ASYNC 0x040 /* bunmapi xactions can be async */ -#define XFS_BMAPI_RSVBLOCKS 0x080 /* OK to alloc. reserved data blocks */ -#define XFS_BMAPI_PREALLOC 0x100 /* preallocation op: unwritten space */ -#define XFS_BMAPI_IGSTATE 0x200 /* Ignore state - */ +#define XFS_BMAPI_ENTIRE 0x001 /* return entire extent, not trimmed */ +#define XFS_BMAPI_METADATA 0x002 /* mapping metadata not user data */ +#define XFS_BMAPI_ATTRFORK 0x004 /* use attribute fork not data */ +#define XFS_BMAPI_PREALLOC 0x008 /* preallocation op: unwritten space */ +#define XFS_BMAPI_IGSTATE 0x010 /* Ignore state - */ /* combine contig. space */ -#define XFS_BMAPI_CONTIG 0x400 /* must allocate only one extent */ -/* XFS_BMAPI_DIRECT_IO 0x800 */ -#define XFS_BMAPI_CONVERT 0x1000 /* unwritten extent conversion - */ - /* need write cache flushing and no */ - /* additional allocation alignments */ +#define XFS_BMAPI_CONTIG 0x020 /* must allocate only one extent */ +/* + * unwritten extent conversion - this needs write cache flushing and no additional + * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts + * from written to unwritten, otherwise convert from unwritten to written. + */ +#define XFS_BMAPI_CONVERT 0x040 + +#define XFS_BMAPI_FLAGS \ + { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ + { XFS_BMAPI_METADATA, "METADATA" }, \ + { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ + { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ + { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ + { XFS_BMAPI_CONTIG, "CONTIG" }, \ + { XFS_BMAPI_CONVERT, "CONVERT" } + -#define XFS_BMAPI_AFLAG(w) xfs_bmapi_aflag(w) static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0); @@ -94,7 +99,6 @@ static inline int xfs_bmapi_aflag(int w) #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) -#define XFS_BMAP_INIT(flp,fbp) xfs_bmap_init(flp,fbp) static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) { ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ @@ -102,285 +106,81 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) } /* - * Argument structure for xfs_bmap_alloc. + * Flags for xfs_bmap_add_extent*. */ -typedef struct xfs_bmalloca { - xfs_fsblock_t firstblock; /* i/o first block allocated */ - xfs_fsblock_t rval; /* starting block of new extent */ - xfs_fileoff_t off; /* offset in file filling in */ - struct xfs_trans *tp; /* transaction pointer */ - struct xfs_inode *ip; /* incore inode pointer */ - struct xfs_bmbt_irec *prevp; /* extent before the new one */ - struct xfs_bmbt_irec *gotp; /* extent after, or delayed */ - xfs_extlen_t alen; /* i/o length asked/allocated */ - xfs_extlen_t total; /* total blocks needed for xaction */ - xfs_extlen_t minlen; /* mininum allocation size (blocks) */ - xfs_extlen_t minleft; /* amount must be left after alloc */ - char eof; /* set if allocating past last extent */ - char wasdel; /* replacing a delayed allocation */ - char userdata;/* set if is user data */ - char low; /* low on space, using seq'l ags */ - char aeof; /* allocated space at eof */ - char conv; /* overwriting unwritten extents */ -} xfs_bmalloca_t; +#define BMAP_LEFT_CONTIG (1 << 0) +#define BMAP_RIGHT_CONTIG (1 << 1) +#define BMAP_LEFT_FILLING (1 << 2) +#define BMAP_RIGHT_FILLING (1 << 3) +#define BMAP_LEFT_DELAY (1 << 4) +#define BMAP_RIGHT_DELAY (1 << 5) +#define BMAP_LEFT_VALID (1 << 6) +#define BMAP_RIGHT_VALID (1 << 7) +#define BMAP_ATTRFORK (1 << 8) -#ifdef __KERNEL__ +#define XFS_BMAP_EXT_FLAGS \ + { BMAP_LEFT_CONTIG, "LC" }, \ + { BMAP_RIGHT_CONTIG, "RC" }, \ + { BMAP_LEFT_FILLING, "LF" }, \ + { BMAP_RIGHT_FILLING, "RF" }, \ + { BMAP_ATTRFORK, "ATTR" } -#if defined(XFS_BMAP_TRACE) -/* - * Trace operations for bmap extent tracing - */ -#define XFS_BMAP_KTRACE_DELETE 1 -#define XFS_BMAP_KTRACE_INSERT 2 -#define XFS_BMAP_KTRACE_PRE_UP 3 -#define XFS_BMAP_KTRACE_POST_UP 4 - -#define XFS_BMAP_TRACE_SIZE 4096 /* size of global trace buffer */ -#define XFS_BMAP_KTRACE_SIZE 32 /* size of per-inode trace buffer */ -extern ktrace_t *xfs_bmap_trace_buf; /* - * Add bmap trace insert entries for all the contents of the extent list. + * This macro is used to determine how many extents will be shifted + * in one write transaction. We could require two splits, + * an extent move on the first and an extent merge on the second, + * So it is proper that one extent is shifted inside write transaction + * at a time. */ -void -xfs_bmap_trace_exlist( - char *fname, /* function name */ - struct xfs_inode *ip, /* incore inode pointer */ - xfs_extnum_t cnt, /* count of entries in list */ - int whichfork); /* data or attr fork */ +#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 + +#ifdef DEBUG +void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, + int whichfork, unsigned long caller_ip); +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ + xfs_bmap_trace_exlist(ip,c,w, _THIS_IP_) #else -#define xfs_bmap_trace_exlist(f,ip,c,w) +#define XFS_BMAP_TRACE_EXLIST(ip,c,w) #endif -/* - * Convert inode from non-attributed to attributed. - * Must not be in a transaction, ip must not be locked. - */ -int /* error code */ -xfs_bmap_add_attrfork( - struct xfs_inode *ip, /* incore inode pointer */ - int size, /* space needed for new attribute */ - int rsvd); /* flag for reserved block allocation */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -xfs_bmap_add_free( - xfs_fsblock_t bno, /* fs block number of extent */ - xfs_filblks_t len, /* length of extent */ - xfs_bmap_free_t *flist, /* list of extents */ - struct xfs_mount *mp); /* mount point structure */ - -/* - * Routine to clean up the free list data structure when - * an error occurs during a transaction. - */ -void -xfs_bmap_cancel( - xfs_bmap_free_t *flist); /* free list to clean up */ - -/* - * Compute and fill in the value of the maximum depth of a bmap btree - * in this filesystem. Done once, during mount. - */ -void -xfs_bmap_compute_maxlevels( - struct xfs_mount *mp, /* file system mount structure */ - int whichfork); /* data or attr fork */ - -/* - * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi - * caller. Frees all the extents that need freeing, which must be done - * last due to locking considerations. - * - * Return 1 if the given transaction was committed and a new one allocated, - * and 0 otherwise. - */ -int /* error */ -xfs_bmap_finish( - struct xfs_trans **tp, /* transaction pointer addr */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_fsblock_t firstblock, /* controlled a.g. for allocs */ - int *committed); /* xact committed or not */ - -/* - * Returns the file-relative block number of the first unused block in the file. - * This is the lowest-address hole if the file has holes, else the first block - * past the end of file. - */ -int /* error */ -xfs_bmap_first_unused( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_extlen_t len, /* size of hole to find */ - xfs_fileoff_t *unused, /* unused block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the last block + 1 before - * last_block (input value) in the file. - * This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_before( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork); /* data or attr fork */ - -/* - * Returns the file-relative block number of the first block past eof in - * the file. This is not based on i_size, it is based on the extent list. - * Returns 0 for local files, as they do not have an extent list. - */ -int /* error */ -xfs_bmap_last_offset( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t *unused, /* last block num */ - int whichfork); /* data or attr fork */ - -/* - * Returns whether the selected fork of the inode has exactly one - * block or not. For the data fork we check this matches di_size, - * implying the file's range is 0..bsize-1. - */ -int -xfs_bmap_one_block( - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Read in the extents to iu_extents. - * All inode fields are set up by caller, we just traverse the btree - * and copy the records in. - */ -int /* error */ -xfs_bmap_read_extents( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork); /* data or attr fork */ - -/* - * Map file blocks to filesystem blocks. - * File range is given by the bno/len pair. - * Adds blocks to file if a write ("flags & XFS_BMAPI_WRITE" set) - * into a hole or past eof. - * Only allocates blocks from a single allocation group, - * to avoid locking problems. - * The returned value in "firstblock" from the first call in a transaction - * must be remembered and presented to subsequent calls in "firstblock". - * An upper bound for the number of blocks to be allocated is supplied to - * the first call in "total"; if no allocation group has that many free - * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). - */ -int /* error */ -xfs_bmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting file offs. mapped */ - xfs_filblks_t len, /* length to map in file */ - int flags, /* XFS_BMAPI_... */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_extlen_t total, /* total blocks needed */ - struct xfs_bmbt_irec *mval, /* output: map values */ - int *nmap, /* i/o: mval size/count */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta); /* o: change made to incore - extents */ - -/* - * Map file blocks to filesystem blocks, simple version. - * One block only, read-only. - * For flags, only the XFS_BMAPI_ATTRFORK flag is examined. - * For the other flag values, the effect is as if XFS_BMAPI_METADATA - * was set and all the others were clear. - */ -int /* error */ -xfs_bmapi_single( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - int whichfork, /* data or attr fork */ - xfs_fsblock_t *fsb, /* output: mapped block */ - xfs_fileoff_t bno); /* starting file offs. mapped */ - -/* - * Unmap (remove) blocks from a file. - * If nexts is nonzero then the number of extents to remove is limited to - * that value. If not all extents in the block range can be removed then - * *done is set. - */ -int /* error */ -xfs_bunmapi( - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_inode *ip, /* incore inode */ - xfs_fileoff_t bno, /* starting offset to unmap */ - xfs_filblks_t len, /* length to unmap in file */ - int flags, /* XFS_BMAPI_... */ - xfs_extnum_t nexts, /* number of extents max */ - xfs_fsblock_t *firstblock, /* first allocated block - controls a.g. for allocs */ - xfs_bmap_free_t *flist, /* i/o: list extents to free */ - xfs_extdelta_t *delta, /* o: change made to incore - extents */ - int *done); /* set if not done yet */ - -/* - * Fcntl interface to xfs_bmapi. - */ -int /* error code */ -xfs_getbmap( - bhv_desc_t *bdp, /* XFS behavior descriptor*/ - struct getbmap *bmv, /* user bmap structure */ - void __user *ap, /* pointer to user's array */ - int iflags); /* interface flags */ - -/* - * Check if the endoff is outside the last extent. If so the caller will grow - * the allocation to a stripe unit boundary - */ -int -xfs_bmap_eof( - struct xfs_inode *ip, - xfs_fileoff_t endoff, - int whichfork, - int *eof); - -/* - * Count fsblocks of the given fork. - */ -int -xfs_bmap_count_blocks( - xfs_trans_t *tp, - struct xfs_inode *ip, - int whichfork, - int *count); - -/* - * Check an extent list, which has just been read, for - * any bit in the extent flag field. - */ -int -xfs_check_nostate_extents( - struct xfs_ifork *ifp, - xfs_extnum_t idx, - xfs_extnum_t num); - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_t * -xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, - xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); - -#endif /* __KERNEL__ */ +int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); +void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); +void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, + struct xfs_bmap_free *flist, struct xfs_mount *mp); +void xfs_bmap_cancel(struct xfs_bmap_free *flist); +void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); +int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); +int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t *last_block, int whichfork); +int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, + int whichfork); +int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork); +int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork); +int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_delay(struct xfs_inode *ip, xfs_fileoff_t bno, + xfs_filblks_t len, struct xfs_bmbt_irec *mval, + int *nmap, int flags); +int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_fsblock_t *firstblock, xfs_extlen_t total, + struct xfs_bmbt_irec *mval, int *nmap, + struct xfs_bmap_free *flist); +int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, int flags, + xfs_extnum_t nexts, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int *done); +int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, + xfs_extnum_t num); +uint xfs_default_attroffset(struct xfs_inode *ip); +int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, + int *done, xfs_fileoff_t start_fsb, + xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, + xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + int num_exts); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 18fb7385d71..948836c4fd9 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c @@ -17,1497 +17,26 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_error.h" #include "xfs_quota.h" - -#if defined(XFS_BMBT_TRACE) -ktrace_t *xfs_bmbt_trace_buf; -#endif - -/* - * Prototypes for internal btree functions. - */ - - -STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *); -STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *, - xfs_bmbt_key_t *, xfs_btree_cur_t **, int *); -STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int); - - -#if defined(XFS_BMBT_TRACE) - -static char ARGS[] = "args"; -static char ENTRY[] = "entry"; -static char ERROR[] = "error"; -#undef EXIT -static char EXIT[] = "exit"; - -/* - * Add a trace buffer entry for the arguments given to the routine, - * generic form. - */ -STATIC void -xfs_bmbt_trace_enter( - char *func, - xfs_btree_cur_t *cur, - char *s, - int type, - int line, - __psunsigned_t a0, - __psunsigned_t a1, - __psunsigned_t a2, - __psunsigned_t a3, - __psunsigned_t a4, - __psunsigned_t a5, - __psunsigned_t a6, - __psunsigned_t a7, - __psunsigned_t a8, - __psunsigned_t a9, - __psunsigned_t a10) -{ - xfs_inode_t *ip; - int whichfork; - - ip = cur->bc_private.b.ip; - whichfork = cur->bc_private.b.whichfork; - ktrace_enter(xfs_bmbt_trace_buf, - (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), - (void *)func, (void *)s, (void *)ip, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); - ASSERT(ip->i_btrace); - ktrace_enter(ip->i_btrace, - (void *)((__psint_t)type | (whichfork << 8) | (line << 16)), - (void *)func, (void *)s, (void *)ip, (void *)cur, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)a8, (void *)a9, (void *)a10); -} -/* - * Add a trace buffer entry for arguments, for a buffer & 1 integer arg. - */ -STATIC void -xfs_bmbt_trace_argbi( - char *func, - xfs_btree_cur_t *cur, - xfs_buf_t *b, - int i, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line, - (__psunsigned_t)b, i, 0, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for a buffer & 2 integer args. - */ -STATIC void -xfs_bmbt_trace_argbii( - char *func, - xfs_btree_cur_t *cur, - xfs_buf_t *b, - int i0, - int i1, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line, - (__psunsigned_t)b, i0, i1, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for 3 block-length args - * and an integer arg. - */ -STATIC void -xfs_bmbt_trace_argfffi( - char *func, - xfs_btree_cur_t *cur, - xfs_dfiloff_t o, - xfs_dfsbno_t b, - xfs_dfilblks_t i, - int j, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line, - o >> 32, (int)o, b >> 32, (int)b, - i >> 32, (int)i, (int)j, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for one integer arg. - */ -STATIC void -xfs_bmbt_trace_argi( - char *func, - xfs_btree_cur_t *cur, - int i, - int line) -{ - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line, - i, 0, 0, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, fsblock, key. - */ -STATIC void -xfs_bmbt_trace_argifk( - char *func, - xfs_btree_cur_t *cur, - int i, - xfs_fsblock_t f, - xfs_bmbt_key_t *k, - int line) -{ - xfs_dfsbno_t d; - xfs_dfiloff_t o; - - d = (xfs_dfsbno_t)f; - o = INT_GET(k->br_startoff, ARCH_CONVERT); - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, - i, d >> 32, (int)d, o >> 32, - (int)o, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, fsblock, rec. - */ -STATIC void -xfs_bmbt_trace_argifr( - char *func, - xfs_btree_cur_t *cur, - int i, - xfs_fsblock_t f, - xfs_bmbt_rec_t *r, - int line) -{ - xfs_dfsbno_t b; - xfs_dfilblks_t c; - xfs_dfsbno_t d; - xfs_dfiloff_t o; - xfs_bmbt_irec_t s; - - d = (xfs_dfsbno_t)f; - xfs_bmbt_disk_get_all(r, &s); - o = (xfs_dfiloff_t)s.br_startoff; - b = (xfs_dfsbno_t)s.br_startblock; - c = s.br_blockcount; - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line, - i, d >> 32, (int)d, o >> 32, - (int)o, b >> 32, (int)b, c >> 32, - (int)c, 0, 0); -} - -/* - * Add a trace buffer entry for arguments, for int, key. - */ -STATIC void -xfs_bmbt_trace_argik( - char *func, - xfs_btree_cur_t *cur, - int i, - xfs_bmbt_key_t *k, - int line) -{ - xfs_dfiloff_t o; - - o = INT_GET(k->br_startoff, ARCH_CONVERT); - xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line, - i, o >> 32, (int)o, 0, - 0, 0, 0, 0, - 0, 0, 0); -} - -/* - * Add a trace buffer entry for the cursor/operation. - */ -STATIC void -xfs_bmbt_trace_cursor( - char *func, - xfs_btree_cur_t *cur, - char *s, - int line) -{ - xfs_bmbt_rec_t r; - - xfs_bmbt_set_all(&r, &cur->bc_rec.b); - xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line, - (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) | - cur->bc_private.b.allocated, - INT_GET(r.l0, ARCH_CONVERT) >> 32, (int)INT_GET(r.l0, ARCH_CONVERT), INT_GET(r.l1, ARCH_CONVERT) >> 32, (int)INT_GET(r.l1, ARCH_CONVERT), - (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1], - (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3], - (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1], - (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]); -} - -#define XFS_BMBT_TRACE_ARGBI(c,b,i) \ - xfs_bmbt_trace_argbi(fname, c, b, i, __LINE__) -#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ - xfs_bmbt_trace_argbii(fname, c, b, i, j, __LINE__) -#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ - xfs_bmbt_trace_argfffi(fname, c, o, b, i, j, __LINE__) -#define XFS_BMBT_TRACE_ARGI(c,i) \ - xfs_bmbt_trace_argi(fname, c, i, __LINE__) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) \ - xfs_bmbt_trace_argifk(fname, c, i, f, k, __LINE__) -#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ - xfs_bmbt_trace_argifr(fname, c, i, f, r, __LINE__) -#define XFS_BMBT_TRACE_ARGIK(c,i,k) \ - xfs_bmbt_trace_argik(fname, c, i, k, __LINE__) -#define XFS_BMBT_TRACE_CURSOR(c,s) \ - xfs_bmbt_trace_cursor(fname, c, s, __LINE__) -#else -#define XFS_BMBT_TRACE_ARGBI(c,b,i) -#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) -#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) -#define XFS_BMBT_TRACE_ARGI(c,i) -#define XFS_BMBT_TRACE_ARGIFK(c,i,f,k) -#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) -#define XFS_BMBT_TRACE_ARGIK(c,i,k) -#define XFS_BMBT_TRACE_CURSOR(c,s) -#endif /* XFS_BMBT_TRACE */ - - -/* - * Internal functions. - */ - -/* - * Delete record pointed to by cur/level. - */ -STATIC int /* error */ -xfs_bmbt_delrec( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block; /* bmap btree block */ - xfs_fsblock_t bno; /* fs-relative block number */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_delrec"; -#endif - int i; /* loop counter */ - int j; /* temp state */ - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ - xfs_fsblock_t lbno; /* left sibling block number */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - int lrecs=0; /* left record count */ - xfs_bmbt_rec_t *lrp; /* left record pointer */ - xfs_mount_t *mp; /* file system mount point */ - xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ - int ptr; /* key/record index */ - xfs_fsblock_t rbno; /* right sibling block number */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp; /* right btree key */ - xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */ - xfs_bmbt_ptr_t *rpp; /* right address pointer */ - xfs_bmbt_block_t *rrblock; /* right-right btree block */ - xfs_buf_t *rrbp; /* right-right buffer pointer */ - int rrecs=0; /* right record count */ - xfs_bmbt_rec_t *rrp; /* right record pointer */ - xfs_btree_cur_t *tcur; /* temporary btree cursor */ - int numrecs; /* temporary numrec count */ - int numlrecs, numrrecs; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - ptr = cur->bc_ptrs[level]; - tcur = (xfs_btree_cur_t *)0; - if (ptr == 0) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - block = xfs_bmbt_get_block(cur, level, &bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } -#endif - if (ptr > numrecs) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - XFS_STATS_INC(xs_bmbt_delrec); - if (level > 0) { - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); -#ifdef DEBUG - for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - } -#endif - if (ptr < numrecs) { - memmove(&kp[ptr - 1], &kp[ptr], - (numrecs - ptr) * sizeof(*kp)); - memmove(&pp[ptr - 1], &pp[ptr], /* INT_: direct copy */ - (numrecs - ptr) * sizeof(*pp)); - xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1); - xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1); - } - } else { - rp = XFS_BMAP_REC_IADDR(block, 1, cur); - if (ptr < numrecs) { - memmove(&rp[ptr - 1], &rp[ptr], - (numrecs - ptr) * sizeof(*rp)); - xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1); - } - if (ptr == 1) { - INT_SET(key.br_startoff, ARCH_CONVERT, xfs_bmbt_disk_get_startoff(rp)); - kp = &key; - } - } - numrecs--; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); - /* - * We're at the root level. - * First, shrink the root block in-memory. - * Try to get rid of the next level down. - * If we can't then there's nothing left to do. - */ - if (level == cur->bc_nlevels - 1) { - xfs_iroot_realloc(cur->bc_private.b.ip, -1, - cur->bc_private.b.whichfork); - if ((error = xfs_bmbt_killroot(cur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) { - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - rbno = be64_to_cpu(block->bb_rightsib); - lbno = be64_to_cpu(block->bb_leftsib); - /* - * One child of root, need to get a chance to copy its contents - * into the root and delete it. Can't go up to next level, - * there's nothing to delete there. - */ - if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK && - level == cur->bc_nlevels - 2) { - if ((error = xfs_bmbt_killroot(cur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK); - if ((error = xfs_btree_dup_cursor(cur, &tcur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - bno = NULLFSBLOCK; - if (rbno != NULLFSBLOCK) { - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_increment(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - rbp = tcur->bc_bufs[level]; - right = XFS_BUF_TO_BMBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } -#endif - bno = be64_to_cpu(right->bb_leftsib); - if (be16_to_cpu(right->bb_numrecs) - 1 >= - XFS_BMAP_BLOCK_IMINRECS(level, cur)) { - if ((error = xfs_bmbt_lshift(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_BMAP_BLOCK_IMINRECS(level, tcur)); - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - tcur = NULL; - if (level > 0) { - if ((error = xfs_bmbt_decrement(cur, - level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, - ERROR); - goto error0; - } - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - } - rrecs = be16_to_cpu(right->bb_numrecs); - if (lbno != NULLFSBLOCK) { - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_decrement(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } - } - if (lbno != NULLFSBLOCK) { - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - /* - * decrement to last in block - */ - if ((error = xfs_bmbt_decrement(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - lbp = tcur->bc_bufs[level]; - left = XFS_BUF_TO_BMBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } -#endif - bno = be64_to_cpu(left->bb_rightsib); - if (be16_to_cpu(left->bb_numrecs) - 1 >= - XFS_BMAP_BLOCK_IMINRECS(level, cur)) { - if ((error = xfs_bmbt_rshift(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_BMAP_BLOCK_IMINRECS(level, tcur)); - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - tcur = NULL; - if (level == 0) - cur->bc_ptrs[0]++; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - } - lrecs = be16_to_cpu(left->bb_numrecs); - } - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - tcur = NULL; - mp = cur->bc_mp; - ASSERT(bno != NULLFSBLOCK); - if (lbno != NULLFSBLOCK && - lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - rbno = bno; - right = block; - rbp = bp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - left = XFS_BUF_TO_BMBT_BLOCK(lbp); - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - } else if (rbno != NULLFSBLOCK && - rrecs + be16_to_cpu(block->bb_numrecs) <= - XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - lbno = bno; - left = block; - lbp = bp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - right = XFS_BUF_TO_BMBT_BLOCK(rbp); - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - lrecs = be16_to_cpu(left->bb_numrecs); - } else { - if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - numlrecs = be16_to_cpu(left->bb_numrecs); - numrrecs = be16_to_cpu(right->bb_numrecs); - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur); - lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < numrrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - } -#endif - memcpy(lkp, rkp, numrrecs * sizeof(*lkp)); - memcpy(lpp, rpp, numrrecs * sizeof(*lpp)); - xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs); - xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); - } else { - lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - memcpy(lrp, rrp, numrrecs * sizeof(*lrp)); - xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs); - } - be16_add(&left->bb_numrecs, numrrecs); - left->bb_rightsib = right->bb_rightsib; - xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS); - if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) { - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, - be64_to_cpu(left->bb_rightsib), - 0, &rrbp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); - if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - rrblock->bb_leftsib = cpu_to_be64(lbno); - xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); - } - xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1, - cur->bc_private.b.flist, mp); - cur->bc_private.b.ip->i_d.di_nblocks--; - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip, - XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(cur->bc_tp, rbp); - if (bp != lbp) { - cur->bc_bufs[level] = lbp; - cur->bc_ptrs[level] += lrecs; - cur->bc_ra[level] = 0; - } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - goto error0; - } - if (level > 0) - cur->bc_ptrs[level]--; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 2; - return 0; - -error0: - if (tcur) - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; -} - -#ifdef DEBUG -/* - * Get the data from the pointed-to record. - */ -int -xfs_bmbt_get_rec( - xfs_btree_cur_t *cur, - xfs_fileoff_t *off, - xfs_fsblock_t *bno, - xfs_filblks_t *len, - xfs_exntst_t *state, - int *stat) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; -#ifdef DEBUG - int error; -#endif - int ptr; - xfs_bmbt_rec_t *rp; - - block = xfs_bmbt_get_block(cur, 0, &bp); - ptr = cur->bc_ptrs[0]; -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) - return error; -#endif - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - *off = xfs_bmbt_disk_get_startoff(rp); - *bno = xfs_bmbt_disk_get_startblock(rp); - *len = xfs_bmbt_disk_get_blockcount(rp); - *state = xfs_bmbt_disk_get_state(rp); - *stat = 1; - return 0; -} -#endif - -/* - * Insert one record/level. Return information to the caller - * allowing the next level up to proceed if necessary. - */ -STATIC int /* error */ -xfs_bmbt_insrec( - xfs_btree_cur_t *cur, - int level, - xfs_fsblock_t *bnop, - xfs_bmbt_rec_t *recp, - xfs_btree_cur_t **curp, - int *stat) /* no-go/done/continue */ -{ - xfs_bmbt_block_t *block; /* bmap btree block */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_insrec"; -#endif - int i; /* loop index */ - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */ - int logflags; /* inode logging flags */ - xfs_fsblock_t nbno; /* new block number */ - struct xfs_btree_cur *ncur; /* new btree cursor */ - xfs_bmbt_key_t nkey; /* new btree key value */ - xfs_bmbt_rec_t nrec; /* new record count */ - int optr; /* old key/record index */ - xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ - int ptr; /* key/record index */ - xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */ - int numrecs; - - ASSERT(level < cur->bc_nlevels); - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp); - ncur = (xfs_btree_cur_t *)0; - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(recp)); - optr = ptr = cur->bc_ptrs[level]; - if (ptr == 0) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - XFS_STATS_INC(xs_bmbt_insrec); - block = xfs_bmbt_get_block(cur, level, &bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (ptr <= numrecs) { - if (level == 0) { - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp); - } else { - kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); - xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp); - } - } -#endif - nbno = NULLFSBLOCK; - if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { - /* - * A root block, that can be made bigger. - */ - xfs_iroot_realloc(cur->bc_private.b.ip, 1, - cur->bc_private.b.whichfork); - block = xfs_bmbt_get_block(cur, level, &bp); - } else if (level == cur->bc_nlevels - 1) { - if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) || - *stat == 0) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, - logflags); - block = xfs_bmbt_get_block(cur, level, &bp); - } else { - if ((error = xfs_bmbt_rshift(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (i) { - /* nothing */ - } else { - if ((error = xfs_bmbt_lshift(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (i) { - optr = ptr = cur->bc_ptrs[level]; - } else { - if ((error = xfs_bmbt_split(cur, level, - &nbno, &nkey, &ncur, - &i))) { - XFS_BMBT_TRACE_CURSOR(cur, - ERROR); - return error; - } - if (i) { - block = xfs_bmbt_get_block( - cur, level, &bp); -#ifdef DEBUG - if ((error = - xfs_btree_check_lblock(cur, - block, level, bp))) { - XFS_BMBT_TRACE_CURSOR( - cur, ERROR); - return error; - } -#endif - ptr = cur->bc_ptrs[level]; - xfs_bmbt_disk_set_allf(&nrec, - nkey.br_startoff, 0, 0, - XFS_EXT_NORM); - } else { - XFS_BMBT_TRACE_CURSOR(cur, - EXIT); - *stat = 0; - return 0; - } - } - } - } - } - numrecs = be16_to_cpu(block->bb_numrecs); - if (level > 0) { - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); -#ifdef DEBUG - for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), - level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memmove(&kp[ptr], &kp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], /* INT_: direct copy */ - (numrecs - ptr + 1) * sizeof(*pp)); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)*bnop, - level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - kp[ptr - 1] = key; - INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_bmbt_log_keys(cur, bp, ptr, numrecs); - xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs); - } else { - rp = XFS_BMAP_REC_IADDR(block, 1, cur); - memmove(&rp[ptr], &rp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*rp)); - rp[ptr - 1] = *recp; - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_bmbt_log_recs(cur, bp, ptr, numrecs); - } - xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS); -#ifdef DEBUG - if (ptr < numrecs) { - if (level == 0) - xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1, - rp + ptr); - else - xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1, - kp + ptr); - } -#endif - if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - *bnop = nbno; - if (nbno != NULLFSBLOCK) { - *recp = nrec; - *curp = ncur; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -STATIC int -xfs_bmbt_killroot( - xfs_btree_cur_t *cur) -{ - xfs_bmbt_block_t *block; - xfs_bmbt_block_t *cblock; - xfs_buf_t *cbp; - xfs_bmbt_key_t *ckp; - xfs_bmbt_ptr_t *cpp; -#ifdef DEBUG - int error; -#endif -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_killroot"; -#endif - int i; - xfs_bmbt_key_t *kp; - xfs_inode_t *ip; - xfs_ifork_t *ifp; - int level; - xfs_bmbt_ptr_t *pp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - level = cur->bc_nlevels - 1; - ASSERT(level >= 1); - /* - * Don't deal with the root block needs to be a leaf case. - * We're just going to turn the thing back into extents anyway. - */ - if (level == 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - block = xfs_bmbt_get_block(cur, level, &cbp); - /* - * Give up if the root has multiple children. - */ - if (be16_to_cpu(block->bb_numrecs) != 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - /* - * Only do this if the next level will fit. - * Then the data must be copied up to the inode, - * instead of freeing the root you free the next level. - */ - cbp = cur->bc_bufs[level - 1]; - cblock = XFS_BUF_TO_BMBT_BLOCK(cbp); - if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO); - ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO); - ip = cur->bc_private.b.ip; - ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork); - ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) == - XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes)); - i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur)); - if (i) { - xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork); - block = ifp->if_broot; - } - be16_add(&block->bb_numrecs, i); - ASSERT(block->bb_numrecs == cblock->bb_numrecs); - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); - memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp)); - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(cpp[i], ARCH_CONVERT), level - 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp)); - xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1, - cur->bc_private.b.flist, cur->bc_mp); - ip->i_d.di_nblocks--; - XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip, - XFS_TRANS_DQ_BCOUNT, -1L); - xfs_trans_binval(cur->bc_tp, cbp); - cur->bc_bufs[level - 1] = NULL; - be16_add(&block->bb_level, -1); - xfs_trans_log_inode(cur->bc_tp, ip, - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - cur->bc_nlevels--; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; -} - -/* - * Log key values from the btree block. - */ -STATIC void -xfs_bmbt_log_keys( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int kfirst, - int klast) -{ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_keys"; -#endif - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast); - tp = cur->bc_tp; - if (bp) { - xfs_bmbt_block_t *block; - int first; - xfs_bmbt_key_t *kp; - int last; - - block = XFS_BUF_TO_BMBT_BLOCK(bp); - kp = XFS_BMAP_KEY_DADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(tp, bp, first, last); - } else { - xfs_inode_t *ip; - - ip = cur->bc_private.b.ip; - xfs_trans_log_inode(tp, ip, - XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -/* - * Log pointer values from the btree block. - */ -STATIC void -xfs_bmbt_log_ptrs( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int pfirst, - int plast) -{ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_ptrs"; -#endif - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast); - tp = cur->bc_tp; - if (bp) { - xfs_bmbt_block_t *block; - int first; - int last; - xfs_bmbt_ptr_t *pp; - - block = XFS_BUF_TO_BMBT_BLOCK(bp); - pp = XFS_BMAP_PTR_DADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(tp, bp, first, last); - } else { - xfs_inode_t *ip; - - ip = cur->bc_private.b.ip; - xfs_trans_log_inode(tp, ip, - XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -/* - * Lookup the record. The cursor is made to point to it, based on dir. - */ -STATIC int /* error */ -xfs_bmbt_lookup( - xfs_btree_cur_t *cur, - xfs_lookup_t dir, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block=NULL; - xfs_buf_t *bp; - xfs_daddr_t d; - xfs_sfiloff_t diff; - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_lookup"; -#endif - xfs_fsblock_t fsbno=0; - int high; - int i; - int keyno=0; - xfs_bmbt_key_t *kkbase=NULL; - xfs_bmbt_key_t *kkp; - xfs_bmbt_rec_t *krbase=NULL; - xfs_bmbt_rec_t *krp; - int level; - int low; - xfs_mount_t *mp; - xfs_bmbt_ptr_t *pp; - xfs_bmbt_irec_t *rp; - xfs_fileoff_t startoff; - xfs_trans_t *tp; - - XFS_STATS_INC(xs_bmbt_lookup); - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, (int)dir); - tp = cur->bc_tp; - mp = cur->bc_mp; - rp = &cur->bc_rec.b; - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { - if (level < cur->bc_nlevels - 1) { - d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; - if (!bp) { - if ((error = xfs_btree_read_bufl(mp, tp, fsbno, - 0, &bp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - xfs_btree_setbuf(cur, level, bp); - block = XFS_BUF_TO_BMBT_BLOCK(bp); - if ((error = xfs_btree_check_lblock(cur, block, - level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } else - block = XFS_BUF_TO_BMBT_BLOCK(bp); - } else - block = xfs_bmbt_get_block(cur, level, &bp); - if (diff == 0) - keyno = 1; - else { - if (level > 0) - kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur); - else - krbase = XFS_BMAP_REC_IADDR(block, 1, cur); - low = 1; - if (!(high = be16_to_cpu(block->bb_numrecs))) { - ASSERT(level == 0); - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - while (low <= high) { - XFS_STATS_INC(xs_bmbt_compare); - keyno = (low + high) >> 1; - if (level > 0) { - kkp = kkbase + keyno - 1; - startoff = INT_GET(kkp->br_startoff, ARCH_CONVERT); - } else { - krp = krbase + keyno - 1; - startoff = xfs_bmbt_disk_get_startoff(krp); - } - diff = (xfs_sfiloff_t) - (startoff - rp->br_startoff); - if (diff < 0) - low = keyno + 1; - else if (diff > 0) - high = keyno - 1; - else - break; - } - } - if (level > 0) { - if (diff > 0 && --keyno < 1) - keyno = 1; - pp = XFS_BMAP_PTR_IADDR(block, keyno, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - fsbno = INT_GET(*pp, ARCH_CONVERT); - cur->bc_ptrs[level] = keyno; - } - } - if (dir != XFS_LOOKUP_LE && diff < 0) { - keyno++; - /* - * If ge search and we went off the end of the block, but it's - * not the last block, we're in the wrong block. - */ - if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) && - be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) { - cur->bc_ptrs[0] = keyno; - if ((error = xfs_bmbt_increment(cur, 0, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - XFS_WANT_CORRUPTED_RETURN(i == 1); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - } - else if (dir == XFS_LOOKUP_LE && diff > 0) - keyno--; - cur->bc_ptrs[0] = keyno; - if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - } else { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); - } - return 0; -} - -/* - * Move 1 record left from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_bmbt_lshift( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_lshift"; -#endif -#ifdef DEBUG - int i; /* loop counter */ -#endif - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp=NULL; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - int lrecs; /* left record count */ - xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */ - xfs_mount_t *mp; /* file system mount point */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp=NULL; /* right btree key */ - xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */ - xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ - int rrecs; /* right record count */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - if (level == cur->bc_nlevels - 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - rbp = cur->bc_bufs[level]; - right = XFS_BUF_TO_BMBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - if (cur->bc_ptrs[level] <= 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - mp = cur->bc_mp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0, - &lbp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - left = XFS_BUF_TO_BMBT_BLOCK(lbp); - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - lrecs = be16_to_cpu(left->bb_numrecs) + 1; - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - *lkp = *rkp; - xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs); - lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*rpp, ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - *lpp = *rpp; /* INT_: direct copy */ - xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs); - } else { - lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - *lrp = *rrp; - xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs); - } - left->bb_numrecs = cpu_to_be16(lrecs); - xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp); - else - xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp); -#endif - rrecs = be16_to_cpu(right->bb_numrecs) - 1; - right->bb_numrecs = cpu_to_be16(rrecs); - xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); - if (level > 0) { -#ifdef DEBUG - for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i + 1], ARCH_CONVERT), - level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memmove(rkp, rkp + 1, rrecs * sizeof(*rkp)); - memmove(rpp, rpp + 1, rrecs * sizeof(*rpp)); - xfs_bmbt_log_keys(cur, rbp, 1, rrecs); - xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs); - } else { - memmove(rrp, rrp + 1, rrecs * sizeof(*rrp)); - xfs_bmbt_log_recs(cur, rbp, 1, rrecs); - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(rrp)); - rkp = &key; - } - if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_ptrs[level]--; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -/* - * Move 1 record right from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_bmbt_rshift( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_rshift"; -#endif - int i; /* loop counter */ - xfs_bmbt_key_t key; /* bmap btree key */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - xfs_bmbt_rec_t *lrp; /* left record pointer */ - xfs_mount_t *mp; /* file system mount point */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp; /* right btree key */ - xfs_bmbt_ptr_t *rpp; /* right address pointer */ - xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */ - struct xfs_btree_cur *tcur; /* temporary btree cursor */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - if (level == cur->bc_nlevels - 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_BMBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - mp = cur->bc_mp; - if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0, - &rbp, XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - right = XFS_BUF_TO_BMBT_BLOCK(rbp); - if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur); - lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*lpp, ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - *rkp = *lkp; - *rpp = *lpp; /* INT_: direct copy */ - xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - } else { - lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - *rrp = *lrp; - xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - INT_SET(key.br_startoff, ARCH_CONVERT, - xfs_bmbt_disk_get_startoff(rrp)); - rkp = &key; - } - be16_add(&left->bb_numrecs, -1); - xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, 1); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1); - else - xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1); -#endif - xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS); - if ((error = xfs_btree_dup_cursor(cur, &tcur))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_increment(tcur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(tcur, ERROR); - goto error1; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) { - XFS_BMBT_TRACE_CURSOR(tcur, ERROR); - goto error1; - } - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -error0: - XFS_BMBT_TRACE_CURSOR(cur, ERROR); -error1: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; -} +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" /* * Determine the extent state. @@ -1525,343 +54,44 @@ xfs_extent_state( return XFS_EXT_NORM; } - -/* - * Split cur/level block in half. - * Return new block number and its first record (to be inserted into parent). - */ -STATIC int /* error */ -xfs_bmbt_split( - xfs_btree_cur_t *cur, - int level, - xfs_fsblock_t *bnop, - xfs_bmbt_key_t *keyp, - xfs_btree_cur_t **curp, - int *stat) /* success/failure */ -{ - xfs_alloc_arg_t args; /* block allocation args */ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_split"; -#endif - int i; /* loop counter */ - xfs_fsblock_t lbno; /* left sibling block number */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_bmbt_block_t *left; /* left btree block */ - xfs_bmbt_key_t *lkp; /* left btree key */ - xfs_bmbt_ptr_t *lpp; /* left address pointer */ - xfs_bmbt_rec_t *lrp; /* left record pointer */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_bmbt_block_t *right; /* right btree block */ - xfs_bmbt_key_t *rkp; /* right btree key */ - xfs_bmbt_ptr_t *rpp; /* right address pointer */ - xfs_bmbt_block_t *rrblock; /* right-right btree block */ - xfs_buf_t *rrbp; /* right-right buffer pointer */ - xfs_bmbt_rec_t *rrp; /* right record pointer */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, keyp); - args.tp = cur->bc_tp; - args.mp = cur->bc_mp; - lbp = cur->bc_bufs[level]; - lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp)); - left = XFS_BUF_TO_BMBT_BLOCK(lbp); - args.fsbno = cur->bc_private.b.firstblock; - args.firstblock = args.fsbno; - if (args.fsbno == NULLFSBLOCK) { - args.fsbno = lbno; - args.type = XFS_ALLOCTYPE_START_BNO; - } else - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.minleft = args.alignment = args.total = args.isfl = - args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; - if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return XFS_ERROR(ENOSPC); - } - if ((error = xfs_alloc_vextent(&args))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (args.fsbno == NULLFSBLOCK) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); - XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, - XFS_TRANS_DQ_BCOUNT, 1L); - rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0); - right = XFS_BUF_TO_BMBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); - right->bb_level = left->bb_level; - right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); - if ((be16_to_cpu(left->bb_numrecs) & 1) && - cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add(&right->bb_numrecs, 1); - i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; - if (level > 0) { - lkp = XFS_BMAP_KEY_IADDR(left, i, cur); - lpp = XFS_BMAP_PTR_IADDR(left, i, cur); - rkp = XFS_BMAP_KEY_IADDR(right, 1, cur); - rpp = XFS_BMAP_PTR_IADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(lpp[i], ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->br_startoff = INT_GET(rkp->br_startoff, ARCH_CONVERT); - } else { - lrp = XFS_BMAP_REC_IADDR(left, i, cur); - rrp = XFS_BMAP_REC_IADDR(right, 1, cur); - memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->br_startoff = xfs_bmbt_disk_get_startoff(rrp); - } - be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); - right->bb_rightsib = left->bb_rightsib; - left->bb_rightsib = cpu_to_be64(args.fsbno); - right->bb_leftsib = cpu_to_be64(lbno); - xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS); - xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) { - if ((error = xfs_btree_read_bufl(args.mp, args.tp, - be64_to_cpu(right->bb_rightsib), 0, &rrbp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp); - if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - rrblock->bb_leftsib = cpu_to_be64(args.fsbno); - xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB); - } - if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { - xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); - } - if (level + 1 < cur->bc_nlevels) { - if ((error = xfs_btree_dup_cursor(cur, curp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - (*curp)->bc_ptrs[level + 1]++; - } - *bnop = args.fsbno; - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - - -/* - * Update keys for the record. - */ -STATIC int -xfs_bmbt_updkey( - xfs_btree_cur_t *cur, - xfs_bmbt_key_t *keyp, /* on-disk format */ - int level) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; -#ifdef DEBUG - int error; -#endif -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_updkey"; -#endif - xfs_bmbt_key_t *kp; - int ptr; - - ASSERT(level >= 1); - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGIK(cur, level, keyp); - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { - block = xfs_bmbt_get_block(cur, level, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - ptr = cur->bc_ptrs[level]; - kp = XFS_BMAP_KEY_IADDR(block, ptr, cur); - *kp = *keyp; - xfs_bmbt_log_keys(cur, bp, ptr, ptr); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; -} - /* * Convert on-disk form of btree root to in-memory form. */ void xfs_bmdr_to_bmbt( + struct xfs_inode *ip, xfs_bmdr_block_t *dblock, int dblocklen, - xfs_bmbt_block_t *rblock, + struct xfs_btree_block *rblock, int rblocklen) { + struct xfs_mount *mp = ip->i_mount; int dmxr; xfs_bmbt_key_t *fkp; - xfs_bmbt_ptr_t *fpp; + __be64 *fpp; xfs_bmbt_key_t *tkp; - xfs_bmbt_ptr_t *tpp; + __be64 *tpp; + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL, + XFS_BMAP_MAGIC, 0, 0, ip->i_ino, + XFS_BTREE_LONG_PTRS); - rblock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); rblock->bb_level = dblock->bb_level; ASSERT(be16_to_cpu(rblock->bb_level) > 0); rblock->bb_numrecs = dblock->bb_numrecs; - rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO); - rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO); - dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); - fkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); - tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - fpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); - tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); + fkp = XFS_BMDR_KEY_ADDR(dblock, 1); + tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); + tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); - memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ -} - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_bmbt_decrement( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_decrement"; -#endif - xfs_fsblock_t fsbno; - int lev; - xfs_mount_t *mp; - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); - if (level < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); - if (--cur->bc_ptrs[level] > 0) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - block = xfs_bmbt_get_block(cur, level, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) - break; - if (lev < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); - } - if (lev == cur->bc_nlevels) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - tp = cur->bc_tp; - mp = cur->bc_mp; - for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); - if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_BMBT_BLOCK(bp); - if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs); - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -/* - * Delete the record pointed to by cur. - */ -int /* error */ -xfs_bmbt_delete( - xfs_btree_cur_t *cur, - int *stat) /* success/failure */ -{ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_delete"; -#endif - int i; - int level; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - for (level = 0, i = 2; i == 2; level++) { - if ((error = xfs_bmbt_delrec(cur, level, &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } - if (i == 0) { - for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { - if ((error = xfs_bmbt_decrement(cur, level, - &i))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - break; - } - } - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = i; - return 0; + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* @@ -1869,8 +99,7 @@ xfs_bmbt_delete( * This code must be in sync with the routines xfs_bmbt_get_startoff, * xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state. */ - -STATIC __inline__ void +STATIC void __xfs_bmbt_get_all( __uint64_t l0, __uint64_t l1, @@ -1881,25 +110,25 @@ __xfs_bmbt_get_all( ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); s->br_startoff = ((xfs_fileoff_t)l0 & - XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; #if XFS_BIG_BLKNOS - s->br_startblock = (((xfs_fsblock_t)l0 & XFS_MASK64LO(9)) << 43) | + s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | (((xfs_fsblock_t)l1) >> 21); #else #ifdef DEBUG { xfs_dfsbno_t b; - b = (((xfs_dfsbno_t)l0 & XFS_MASK64LO(9)) << 43) | + b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) | (((xfs_dfsbno_t)l1) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); s->br_startblock = (xfs_fsblock_t)b; } #else /* !DEBUG */ s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21); #endif /* DEBUG */ #endif /* XFS_BIG_BLKNOS */ - s->br_blockcount = (xfs_filblks_t)(l1 & XFS_MASK64LO(21)); + s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); /* This is xfs_extent_state() in-line */ if (ext_flag) { ASSERT(s->br_blockcount != 0); /* saved for DMIG */ @@ -1911,45 +140,20 @@ __xfs_bmbt_get_all( void xfs_bmbt_get_all( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s) { __xfs_bmbt_get_all(r->l0, r->l1, s); } /* - * Get the block pointer for the given level of the cursor. - * Fill in the buffer pointer, if applicable. - */ -xfs_bmbt_block_t * -xfs_bmbt_get_block( - xfs_btree_cur_t *cur, - int level, - xfs_buf_t **bpp) -{ - xfs_ifork_t *ifp; - xfs_bmbt_block_t *rval; - - if (level < cur->bc_nlevels - 1) { - *bpp = cur->bc_bufs[level]; - rval = XFS_BUF_TO_BMBT_BLOCK(*bpp); - } else { - *bpp = NULL; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, - cur->bc_private.b.whichfork); - rval = ifp->if_broot; - } - return rval; -} - -/* * Extract the blockcount field from an in memory bmap extent record. */ xfs_filblks_t xfs_bmbt_get_blockcount( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { - return (xfs_filblks_t)(r->l1 & XFS_MASK64LO(21)); + return (xfs_filblks_t)(r->l1 & xfs_mask64lo(21)); } /* @@ -1957,18 +161,18 @@ xfs_bmbt_get_blockcount( */ xfs_fsblock_t xfs_bmbt_get_startblock( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { #if XFS_BIG_BLKNOS - return (((xfs_fsblock_t)r->l0 & XFS_MASK64LO(9)) << 43) | + return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | (((xfs_fsblock_t)r->l1) >> 21); #else #ifdef DEBUG xfs_dfsbno_t b; - b = (((xfs_dfsbno_t)r->l0 & XFS_MASK64LO(9)) << 43) | + b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) | (((xfs_dfsbno_t)r->l1) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); + ASSERT((b >> 32) == 0 || isnulldstartblock(b)); return (xfs_fsblock_t)b; #else /* !DEBUG */ return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21); @@ -1981,15 +185,15 @@ xfs_bmbt_get_startblock( */ xfs_fileoff_t xfs_bmbt_get_startoff( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { return ((xfs_fileoff_t)r->l0 & - XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } xfs_exntst_t xfs_bmbt_get_state( - xfs_bmbt_rec_t *r) + xfs_bmbt_rec_host_t *r) { int ext_flag; @@ -1998,21 +202,6 @@ xfs_bmbt_get_state( ext_flag); } -#ifndef XFS_NATIVE_HOST -/* Endian flipping versions of the bmbt extraction functions */ -void -xfs_bmbt_disk_get_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - __uint64_t l0, l1; - - l0 = INT_GET(r->l0, ARCH_CONVERT); - l1 = INT_GET(r->l1, ARCH_CONVERT); - - __xfs_bmbt_get_all(l0, l1, s); -} - /* * Extract the blockcount field from an on disk bmap extent record. */ @@ -2020,31 +209,7 @@ xfs_filblks_t xfs_bmbt_disk_get_blockcount( xfs_bmbt_rec_t *r) { - return (xfs_filblks_t)(INT_GET(r->l1, ARCH_CONVERT) & XFS_MASK64LO(21)); -} - -/* - * Extract the startblock field from an on disk bmap extent record. - */ -xfs_fsblock_t -xfs_bmbt_disk_get_startblock( - xfs_bmbt_rec_t *r) -{ -#if XFS_BIG_BLKNOS - return (((xfs_fsblock_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | - (((xfs_fsblock_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); -#else -#ifdef DEBUG - xfs_dfsbno_t b; - - b = (((xfs_dfsbno_t)INT_GET(r->l0, ARCH_CONVERT) & XFS_MASK64LO(9)) << 43) | - (((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); - ASSERT((b >> 32) == 0 || ISNULLDSTARTBLOCK(b)); - return (xfs_fsblock_t)b; -#else /* !DEBUG */ - return (xfs_fsblock_t)(((xfs_dfsbno_t)INT_GET(r->l1, ARCH_CONVERT)) >> 21); -#endif /* DEBUG */ -#endif /* XFS_BIG_BLKNOS */ + return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); } /* @@ -2054,562 +219,142 @@ xfs_fileoff_t xfs_bmbt_disk_get_startoff( xfs_bmbt_rec_t *r) { - return ((xfs_fileoff_t)INT_GET(r->l0, ARCH_CONVERT) & - XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; -} - -xfs_exntst_t -xfs_bmbt_disk_get_state( - xfs_bmbt_rec_t *r) -{ - int ext_flag; - - ext_flag = (int)((INT_GET(r->l0, ARCH_CONVERT)) >> (64 - BMBT_EXNTFLAG_BITLEN)); - return xfs_extent_state(xfs_bmbt_disk_get_blockcount(r), - ext_flag); -} -#endif /* XFS_NATIVE_HOST */ - - -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_bmbt_increment( - xfs_btree_cur_t *cur, - int level, - int *stat) /* success/failure */ -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_increment"; -#endif - xfs_fsblock_t fsbno; - int lev; - xfs_mount_t *mp; - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGI(cur, level); - ASSERT(level < cur->bc_nlevels); - if (level < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); - block = xfs_bmbt_get_block(cur, level, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, level, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; - } - if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - block = xfs_bmbt_get_block(cur, lev, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs)) - break; - if (lev < cur->bc_nlevels - 1) - xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); - } - if (lev == cur->bc_nlevels) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - tp = cur->bc_tp; - mp = cur->bc_mp; - for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) { - fsbno = INT_GET(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur), ARCH_CONVERT); - if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp, - XFS_BMAP_BTREE_REF))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_BMBT_BLOCK(bp); - if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - cur->bc_ptrs[lev] = 1; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 1; - return 0; -} - -/* - * Insert the current record at the point referenced by cur. - */ -int /* error */ -xfs_bmbt_insert( - xfs_btree_cur_t *cur, - int *stat) /* success/failure */ -{ - int error; /* error return value */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_insert"; -#endif - int i; - int level; - xfs_fsblock_t nbno; - xfs_btree_cur_t *ncur; - xfs_bmbt_rec_t nrec; - xfs_btree_cur_t *pcur; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - level = 0; - nbno = NULLFSBLOCK; - xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); - ncur = (xfs_btree_cur_t *)0; - pcur = cur; - do { - if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur, - &i))) { - if (pcur != cur) - xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { - cur->bc_nlevels = pcur->bc_nlevels; - cur->bc_private.b.allocated += - pcur->bc_private.b.allocated; - pcur->bc_private.b.allocated = 0; - ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) || - (cur->bc_private.b.ip->i_d.di_flags & - XFS_DIFLAG_REALTIME)); - cur->bc_private.b.firstblock = - pcur->bc_private.b.firstblock; - ASSERT(cur->bc_private.b.flist == - pcur->bc_private.b.flist); - xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); - } - if (ncur) { - pcur = ncur; - ncur = (xfs_btree_cur_t *)0; - } - } while (nbno != NULLFSBLOCK); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = i; - return 0; -error0: - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; + return ((xfs_fileoff_t)be64_to_cpu(r->l0) & + xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; } -/* - * Log fields from the btree block header. - */ -void -xfs_bmbt_log_block( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int fields) -{ - int first; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_block"; -#endif - int last; - xfs_trans_t *tp; - static const short offsets[] = { - offsetof(xfs_bmbt_block_t, bb_magic), - offsetof(xfs_bmbt_block_t, bb_level), - offsetof(xfs_bmbt_block_t, bb_numrecs), - offsetof(xfs_bmbt_block_t, bb_leftsib), - offsetof(xfs_bmbt_block_t, bb_rightsib), - sizeof(xfs_bmbt_block_t) - }; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBI(cur, bp, fields); - tp = cur->bc_tp; - if (bp) { - xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, - &last); - xfs_trans_log_buf(tp, bp, first, last); - } else - xfs_trans_log_inode(tp, cur->bc_private.b.ip, - XFS_ILOG_FBROOT(cur->bc_private.b.whichfork)); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} /* - * Log record values from the btree block. + * Set all the fields in a bmap extent record from the arguments. */ void -xfs_bmbt_log_recs( - xfs_btree_cur_t *cur, - xfs_buf_t *bp, - int rfirst, - int rlast) -{ - xfs_bmbt_block_t *block; - int first; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_log_recs"; -#endif - int last; - xfs_bmbt_rec_t *rp; - xfs_trans_t *tp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast); - ASSERT(bp); - tp = cur->bc_tp; - block = XFS_BUF_TO_BMBT_BLOCK(bp); - rp = XFS_BMAP_REC_DADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(tp, bp, first, last); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); -} - -int /* error */ -xfs_bmbt_lookup_eq( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - int *stat) /* success/failure */ -{ - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; - return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat); -} - -int /* error */ -xfs_bmbt_lookup_ge( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - int *stat) /* success/failure */ -{ - cur->bc_rec.b.br_startoff = off; - cur->bc_rec.b.br_startblock = bno; - cur->bc_rec.b.br_blockcount = len; - return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat); -} - -/* - * Give the bmap btree a new root block. Copy the old broot contents - * down into a real block and make the broot point to it. - */ -int /* error */ -xfs_bmbt_newroot( - xfs_btree_cur_t *cur, /* btree cursor */ - int *logflags, /* logging flags for inode */ - int *stat) /* return status - 0 fail */ +xfs_bmbt_set_allf( + xfs_bmbt_rec_host_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) { - xfs_alloc_arg_t args; /* allocation arguments */ - xfs_bmbt_block_t *block; /* bmap btree block */ - xfs_buf_t *bp; /* buffer for block */ - xfs_bmbt_block_t *cblock; /* child btree block */ - xfs_bmbt_key_t *ckp; /* child key pointer */ - xfs_bmbt_ptr_t *cpp; /* child ptr pointer */ - int error; /* error return code */ -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_newroot"; -#endif -#ifdef DEBUG - int i; /* loop counter */ -#endif - xfs_bmbt_key_t *kp; /* pointer to bmap btree key */ - int level; /* btree level */ - xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */ - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - level = cur->bc_nlevels - 1; - block = xfs_bmbt_get_block(cur, level, &bp); - /* - * Copy the root into a real block. - */ - args.mp = cur->bc_mp; - pp = XFS_BMAP_PTR_IADDR(block, 1, cur); - args.tp = cur->bc_tp; - args.fsbno = cur->bc_private.b.firstblock; - args.mod = args.minleft = args.alignment = args.total = args.isfl = - args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; - args.firstblock = args.fsbno; - if (args.fsbno == NULLFSBLOCK) { -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - args.fsbno = INT_GET(*pp, ARCH_CONVERT); - args.type = XFS_ALLOCTYPE_START_BNO; - } else - args.type = XFS_ALLOCTYPE_NEAR_BNO; - if ((error = xfs_alloc_vextent(&args))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - if (args.fsbno == NULLFSBLOCK) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - cur->bc_private.b.firstblock = args.fsbno; - cur->bc_private.b.allocated++; - cur->bc_private.b.ip->i_d.di_nblocks++; - XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip, - XFS_TRANS_DQ_BCOUNT, 1L); - bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0); - cblock = XFS_BUF_TO_BMBT_BLOCK(bp); - *cblock = *block; - be16_add(&block->bb_level, 1); - block->bb_numrecs = cpu_to_be16(1); - cur->bc_nlevels++; - cur->bc_ptrs[level + 1] = 1; - kp = XFS_BMAP_KEY_IADDR(block, 1, cur); - ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur); - memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp)); - cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { - if ((error = xfs_btree_check_lptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - } -#endif - memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp)); -#ifdef DEBUG - if ((error = xfs_btree_check_lptr(cur, (xfs_bmbt_ptr_t)args.fsbno, - level))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - INT_SET(*pp, ARCH_CONVERT, args.fsbno); - xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs), - cur->bc_private.b.whichfork); - xfs_btree_setbuf(cur, level, bp); - /* - * Do all this logging at the end so that - * the root is at the right level. - */ - xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS); - xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs)); - xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs)); - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - *logflags |= - XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork); - *stat = 1; - return 0; -} + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; -/* - * Set all the fields in a bmap extent record from the uncompressed form. - */ -void -xfs_bmbt_set_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) -{ - int extent_flag; + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - ASSERT((s->br_state == XFS_EXT_NORM) || - (s->br_state == XFS_EXT_UNWRITTEN)); - extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1; - ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); - ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); #if XFS_BIG_BLKNOS - ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - ((xfs_bmbt_rec_base_t)s->br_startblock >> 43); - r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(s->br_startblock)) { + if (isnullstartblock(startblock)) { r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); - r->l1 = XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); } else { r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9); - r->l1 = ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + ((xfs_bmbt_rec_base_t)startoff << 9); + r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); } #endif /* XFS_BIG_BLKNOS */ } /* - * Set all the fields in a bmap extent record from the arguments. + * Set all the fields in a bmap extent record from the uncompressed form. */ void -xfs_bmbt_set_allf( - xfs_bmbt_rec_t *r, - xfs_fileoff_t o, - xfs_fsblock_t b, - xfs_filblks_t c, - xfs_exntst_t v) +xfs_bmbt_set_all( + xfs_bmbt_rec_host_t *r, + xfs_bmbt_irec_t *s) { - int extent_flag; - - ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); - extent_flag = (v == XFS_EXT_NORM) ? 0 : 1; - ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); -#if XFS_BIG_BLKNOS - ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - ((xfs_bmbt_rec_base_t)b >> 43); - r->l1 = ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); -#else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(b)) { - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); - r->l1 = XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); - } else { - r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9); - r->l1 = ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); - } -#endif /* XFS_BIG_BLKNOS */ + xfs_bmbt_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); } -#ifndef XFS_NATIVE_HOST + /* - * Set all the fields in a bmap extent record from the uncompressed form. + * Set all the fields in a disk format bmap extent record from the arguments. */ void -xfs_bmbt_disk_set_all( - xfs_bmbt_rec_t *r, - xfs_bmbt_irec_t *s) +xfs_bmbt_disk_set_allf( + xfs_bmbt_rec_t *r, + xfs_fileoff_t startoff, + xfs_fsblock_t startblock, + xfs_filblks_t blockcount, + xfs_exntst_t state) { - int extent_flag; + int extent_flag = (state == XFS_EXT_NORM) ? 0 : 1; + + ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); + ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); + ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); - ASSERT((s->br_state == XFS_EXT_NORM) || - (s->br_state == XFS_EXT_UNWRITTEN)); - extent_flag = (s->br_state == XFS_EXT_NORM) ? 0 : 1; - ASSERT((s->br_startoff & XFS_MASK64HI(9)) == 0); - ASSERT((s->br_blockcount & XFS_MASK64HI(43)) == 0); #if XFS_BIG_BLKNOS - ASSERT((s->br_startblock & XFS_MASK64HI(12)) == 0); - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - ((xfs_bmbt_rec_base_t)s->br_startblock >> 43)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); + + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + ((xfs_bmbt_rec_base_t)startblock >> 43)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(s->br_startblock)) { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); - INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + if (isnullstartblock(startblock)) { + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9) | + (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); + r->l1 = cpu_to_be64(xfs_mask64hi(11) | + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); } else { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)s->br_startoff << 9)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)s->br_startblock << 21) | - ((xfs_bmbt_rec_base_t)s->br_blockcount & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); + r->l0 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)extent_flag << 63) | + ((xfs_bmbt_rec_base_t)startoff << 9)); + r->l1 = cpu_to_be64( + ((xfs_bmbt_rec_base_t)startblock << 21) | + ((xfs_bmbt_rec_base_t)blockcount & + (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); } #endif /* XFS_BIG_BLKNOS */ } /* - * Set all the fields in a disk format bmap extent record from the arguments. + * Set all the fields in a bmap extent record from the uncompressed form. */ -void -xfs_bmbt_disk_set_allf( +STATIC void +xfs_bmbt_disk_set_all( xfs_bmbt_rec_t *r, - xfs_fileoff_t o, - xfs_fsblock_t b, - xfs_filblks_t c, - xfs_exntst_t v) + xfs_bmbt_irec_t *s) { - int extent_flag; - - ASSERT((v == XFS_EXT_NORM) || (v == XFS_EXT_UNWRITTEN)); - extent_flag = (v == XFS_EXT_NORM) ? 0 : 1; - ASSERT((o & XFS_MASK64HI(64-BMBT_STARTOFF_BITLEN)) == 0); - ASSERT((c & XFS_MASK64HI(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); -#if XFS_BIG_BLKNOS - ASSERT((b & XFS_MASK64HI(64-BMBT_STARTBLOCK_BITLEN)) == 0); - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - ((xfs_bmbt_rec_base_t)b >> 43)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); -#else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(b)) { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9) | - (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); - INT_SET(r->l1, ARCH_CONVERT, XFS_MASK64HI(11) | - ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); - } else { - INT_SET(r->l0, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)extent_flag << 63) | - ((xfs_bmbt_rec_base_t)o << 9)); - INT_SET(r->l1, ARCH_CONVERT, ((xfs_bmbt_rec_base_t)b << 21) | - ((xfs_bmbt_rec_base_t)c & - (xfs_bmbt_rec_base_t)XFS_MASK64LO(21))); - } -#endif /* XFS_BIG_BLKNOS */ + xfs_bmbt_disk_set_allf(r, s->br_startoff, s->br_startblock, + s->br_blockcount, s->br_state); } -#endif /* XFS_NATIVE_HOST */ /* * Set the blockcount field in a bmap extent record. */ void xfs_bmbt_set_blockcount( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_filblks_t v) { - ASSERT((v & XFS_MASK64HI(43)) == 0); - r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(43)) | - (xfs_bmbt_rec_base_t)(v & XFS_MASK64LO(21)); + ASSERT((v & xfs_mask64hi(43)) == 0); + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64hi(43)) | + (xfs_bmbt_rec_base_t)(v & xfs_mask64lo(21)); } /* @@ -2617,25 +362,25 @@ xfs_bmbt_set_blockcount( */ void xfs_bmbt_set_startblock( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_fsblock_t v) { #if XFS_BIG_BLKNOS - ASSERT((v & XFS_MASK64HI(12)) == 0); - r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64HI(55)) | + ASSERT((v & xfs_mask64hi(12)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | (xfs_bmbt_rec_base_t)(v >> 43); - r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)) | + r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | (xfs_bmbt_rec_base_t)(v << 21); #else /* !XFS_BIG_BLKNOS */ - if (ISNULLSTARTBLOCK(v)) { - r->l0 |= (xfs_bmbt_rec_base_t)XFS_MASK64LO(9); - r->l1 = (xfs_bmbt_rec_base_t)XFS_MASK64HI(11) | + if (isnullstartblock(v)) { + r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9); + r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) | ((xfs_bmbt_rec_base_t)v << 21) | - (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); } else { - r->l0 &= ~(xfs_bmbt_rec_base_t)XFS_MASK64LO(9); + r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9); r->l1 = ((xfs_bmbt_rec_base_t)v << 21) | - (r->l1 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(21)); + (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); } #endif /* XFS_BIG_BLKNOS */ } @@ -2645,13 +390,13 @@ xfs_bmbt_set_startblock( */ void xfs_bmbt_set_startoff( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_fileoff_t v) { - ASSERT((v & XFS_MASK64HI(9)) == 0); - r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) XFS_MASK64HI(1)) | + ASSERT((v & xfs_mask64hi(9)) == 0); + r->l0 = (r->l0 & (xfs_bmbt_rec_base_t) xfs_mask64hi(1)) | ((xfs_bmbt_rec_base_t)v << 9) | - (r->l0 & (xfs_bmbt_rec_base_t)XFS_MASK64LO(9)); + (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64lo(9)); } /* @@ -2659,14 +404,14 @@ xfs_bmbt_set_startoff( */ void xfs_bmbt_set_state( - xfs_bmbt_rec_t *r, + xfs_bmbt_rec_host_t *r, xfs_exntst_t v) { ASSERT(v == XFS_EXT_NORM || v == XFS_EXT_UNWRITTEN); if (v == XFS_EXT_NORM) - r->l0 &= XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN); + r->l0 &= xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN); else - r->l0 |= XFS_MASK64HI(BMBT_EXNTFLAG_BITLEN); + r->l0 |= xfs_mask64hi(BMBT_EXNTFLAG_BITLEN); } /* @@ -2674,79 +419,38 @@ xfs_bmbt_set_state( */ void xfs_bmbt_to_bmdr( - xfs_bmbt_block_t *rblock, + struct xfs_mount *mp, + struct xfs_btree_block *rblock, int rblocklen, xfs_bmdr_block_t *dblock, int dblocklen) { int dmxr; xfs_bmbt_key_t *fkp; - xfs_bmbt_ptr_t *fpp; + __be64 *fpp; xfs_bmbt_key_t *tkp; - xfs_bmbt_ptr_t *tpp; + __be64 *tpp; - ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC); - ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO); - ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO); - ASSERT(be16_to_cpu(rblock->bb_level) > 0); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); + ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)); + ASSERT(rblock->bb_u.l.bb_blkno == + cpu_to_be64(XFS_BUF_DADDR_NULL)); + } else + ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); + ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); + ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); + ASSERT(rblock->bb_level != 0); dblock->bb_level = rblock->bb_level; dblock->bb_numrecs = rblock->bb_numrecs; - dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0); - fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen); - tkp = XFS_BTREE_KEY_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); - fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen); - tpp = XFS_BTREE_PTR_ADDR(dblocklen, xfs_bmdr, dblock, 1, dmxr); + dmxr = xfs_bmdr_maxrecs(dblocklen, 0); + fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1); + tkp = XFS_BMDR_KEY_ADDR(dblock, 1); + fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen); + tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr); dmxr = be16_to_cpu(dblock->bb_numrecs); memcpy(tkp, fkp, sizeof(*fkp) * dmxr); - memcpy(tpp, fpp, sizeof(*fpp) * dmxr); /* INT_: direct copy */ -} - -/* - * Update the record to the passed values. - */ -int -xfs_bmbt_update( - xfs_btree_cur_t *cur, - xfs_fileoff_t off, - xfs_fsblock_t bno, - xfs_filblks_t len, - xfs_exntst_t state) -{ - xfs_bmbt_block_t *block; - xfs_buf_t *bp; - int error; -#ifdef XFS_BMBT_TRACE - static char fname[] = "xfs_bmbt_update"; -#endif - xfs_bmbt_key_t key; - int ptr; - xfs_bmbt_rec_t *rp; - - XFS_BMBT_TRACE_CURSOR(cur, ENTRY); - XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno, - (xfs_dfilblks_t)len, (int)state); - block = xfs_bmbt_get_block(cur, 0, &bp); -#ifdef DEBUG - if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } -#endif - ptr = cur->bc_ptrs[0]; - rp = XFS_BMAP_REC_IADDR(block, ptr, cur); - xfs_bmbt_disk_set_allf(rp, off, bno, len, state); - xfs_bmbt_log_recs(cur, bp, ptr, ptr); - if (ptr > 1) { - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; - } - INT_SET(key.br_startoff, ARCH_CONVERT, off); - if ((error = xfs_bmbt_updkey(cur, &key, 1))) { - XFS_BMBT_TRACE_CURSOR(cur, ERROR); - return error; - } - XFS_BMBT_TRACE_CURSOR(cur, EXIT); - return 0; + memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } /* @@ -2763,10 +467,8 @@ xfs_check_nostate_extents( xfs_extnum_t idx, xfs_extnum_t num) { - xfs_bmbt_rec_t *ep; - for (; num > 0; num--, idx++) { - ep = xfs_iext_get_ext(ifp, idx); + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); if ((ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { ASSERT(0); @@ -2775,3 +477,491 @@ xfs_check_nostate_extents( } return 0; } + + +STATIC struct xfs_btree_cur * +xfs_bmbt_dup_cursor( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_cur *new; + + new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.b.ip, cur->bc_private.b.whichfork); + + /* + * Copy the firstblock, flist, and flags values, + * since init cursor doesn't get them. + */ + new->bc_private.b.firstblock = cur->bc_private.b.firstblock; + new->bc_private.b.flist = cur->bc_private.b.flist; + new->bc_private.b.flags = cur->bc_private.b.flags; + + return new; +} + +STATIC void +xfs_bmbt_update_cursor( + struct xfs_btree_cur *src, + struct xfs_btree_cur *dst) +{ + ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) || + (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME)); + ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist); + + dst->bc_private.b.allocated += src->bc_private.b.allocated; + dst->bc_private.b.firstblock = src->bc_private.b.firstblock; + + src->bc_private.b.allocated = 0; +} + +STATIC int +xfs_bmbt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = cur->bc_private.b.firstblock; + args.firstblock = args.fsbno; + + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); + args.type = XFS_ALLOCTYPE_START_BNO; + /* + * Make sure there is sufficient room left in the AG to + * complete a full tree split for an extent insert. If + * we are converting the middle part of an extent then + * we may need space for two tree splits. + * + * We are relying on the caller to make the correct block + * reservation for this operation to succeed. If the + * reservation amount is insufficient then we may fail a + * block allocation here and corrupt the filesystem. + */ + args.minleft = xfs_trans_get_block_res(args.tp); + } else if (cur->bc_private.b.flist->xbf_low) { + args.type = XFS_ALLOCTYPE_START_BNO; + } else { + args.type = XFS_ALLOCTYPE_NEAR_BNO; + } + + args.minlen = args.maxlen = args.prod = 1; + args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; + if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { + error = XFS_ERROR(ENOSPC); + goto error0; + } + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + + if (args.fsbno == NULLFSBLOCK && args.minleft) { + /* + * Could not find an AG with enough free space to satisfy + * a full btree split. Try again without minleft and if + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + args.minleft = 0; + error = xfs_alloc_vextent(&args); + if (error) + goto error0; + cur->bc_private.b.flist->xbf_low = 1; + } + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + ASSERT(args.len == 1); + cur->bc_private.b.firstblock = args.fsbno; + cur->bc_private.b.allocated++; + cur->bc_private.b.ip->i_d.di_nblocks++; + xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(args.tp, cur->bc_private.b.ip, + XFS_TRANS_DQ_BCOUNT, 1L); + + new->l = cpu_to_be64(args.fsbno); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + + error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_bmbt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_trans *tp = cur->bc_tp; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + + xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); + ip->i_d.di_nblocks--; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); + xfs_trans_binval(tp, bp); + return 0; +} + +STATIC int +xfs_bmbt_get_minrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0) / 2; + } + + return cur->bc_mp->m_bmap_dmnr[level != 0]; +} + +int +xfs_bmbt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level == cur->bc_nlevels - 1) { + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, + cur->bc_private.b.whichfork); + + return xfs_bmbt_maxrecs(cur->bc_mp, + ifp->if_broot_bytes, level == 0); + } + + return cur->bc_mp->m_bmap_dmxr[level != 0]; + +} + +/* + * Get the maximum records we could store in the on-disk format. + * + * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but + * for the root node this checks the available space in the dinode fork + * so that we can resize the in-memory buffer to match it. After a + * resize to the maximum size this function returns the same value + * as xfs_bmbt_get_maxrecs for the root node, too. + */ +STATIC int +xfs_bmbt_get_dmaxrecs( + struct xfs_btree_cur *cur, + int level) +{ + if (level != cur->bc_nlevels - 1) + return cur->bc_mp->m_bmap_dmxr[level != 0]; + return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0); +} + +STATIC void +xfs_bmbt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->bmbt.br_startoff = + cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); +} + +STATIC void +xfs_bmbt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + ASSERT(key->bmbt.br_startoff != 0); + + xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff), + 0, 0, XFS_EXT_NORM); +} + +STATIC void +xfs_bmbt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b); +} + +STATIC void +xfs_bmbt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + ptr->l = 0; +} + +STATIC __int64_t +xfs_bmbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) - + cur->bc_rec.b.br_startoff; +} + +static bool +xfs_bmbt_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + unsigned int level; + + switch (block->bb_magic) { + case cpu_to_be32(XFS_BMAP_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(block->bb_u.l.bb_blkno) != bp->b_bn) + return false; + /* + * XXX: need a better way of verifying the owner here. Right now + * just make sure there has been one set. + */ + if (be64_to_cpu(block->bb_u.l.bb_owner) == 0) + return false; + /* fall through */ + case cpu_to_be32(XFS_BMAP_MAGIC): + break; + default: + return false; + } + + /* + * numrecs and level verification. + * + * We don't know what fork we belong to, so just verify that the level + * is less than the maximum of the two. Later checks will be more + * precise. + */ + level = be16_to_cpu(block->bb_level); + if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1])) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.l.bb_leftsib || + (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) + return false; + if (!block->bb_u.l.bb_rightsib || + (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && + !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) + return false; + + return true; +} + +static void +xfs_bmbt_read_verify( + struct xfs_buf *bp) +{ + if (!xfs_btree_lblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_bmbt_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); + } +} + +static void +xfs_bmbt_write_verify( + struct xfs_buf *bp) +{ + if (!xfs_bmbt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + xfs_btree_lblock_calc_crc(bp); +} + +const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .verify_read = xfs_bmbt_read_verify, + .verify_write = xfs_bmbt_write_verify, +}; + + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_bmbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be64_to_cpu(k1->bmbt.br_startoff) < + be64_to_cpu(k2->bmbt.br_startoff); +} + +STATIC int +xfs_bmbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return xfs_bmbt_disk_get_startoff(&r1->bmbt) + + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= + xfs_bmbt_disk_get_startoff(&r2->bmbt); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_bmbt_ops = { + .rec_len = sizeof(xfs_bmbt_rec_t), + .key_len = sizeof(xfs_bmbt_key_t), + + .dup_cursor = xfs_bmbt_dup_cursor, + .update_cursor = xfs_bmbt_update_cursor, + .alloc_block = xfs_bmbt_alloc_block, + .free_block = xfs_bmbt_free_block, + .get_maxrecs = xfs_bmbt_get_maxrecs, + .get_minrecs = xfs_bmbt_get_minrecs, + .get_dmaxrecs = xfs_bmbt_get_dmaxrecs, + .init_key_from_rec = xfs_bmbt_init_key_from_rec, + .init_rec_from_key = xfs_bmbt_init_rec_from_key, + .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, + .key_diff = xfs_bmbt_key_diff, + .buf_ops = &xfs_bmbt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_bmbt_keys_inorder, + .recs_inorder = xfs_bmbt_recs_inorder, +#endif +}; + +/* + * Allocate a new bmap btree cursor. + */ +struct xfs_btree_cur * /* new bmap btree cursor */ +xfs_bmbt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* inode owning the btree */ + int whichfork) /* data or attr fork */ +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_cur *cur; + + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; + cur->bc_btnum = XFS_BTNUM_BMAP; + cur->bc_blocklog = mp->m_sb.sb_blocklog; + + cur->bc_ops = &xfs_bmbt_ops; + cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; + + cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); + cur->bc_private.b.ip = ip; + cur->bc_private.b.firstblock = NULLFSBLOCK; + cur->bc_private.b.flist = NULL; + cur->bc_private.b.allocated = 0; + cur->bc_private.b.flags = 0; + cur->bc_private.b.whichfork = whichfork; + + return cur; +} + +/* + * Calculate number of records in a bmap btree block. + */ +int +xfs_bmbt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) +{ + blocklen -= XFS_BMBT_BLOCK_LEN(mp); + + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + +/* + * Calculate number of records in a bmap btree inode root. + */ +int +xfs_bmdr_maxrecs( + int blocklen, + int leaf) +{ + blocklen -= sizeof(xfs_bmdr_block_t); + + if (leaf) + return blocklen / sizeof(xfs_bmdr_rec_t); + return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t)); +} + +/* + * Change the owner of a btree format fork fo the inode passed in. Change it to + * the owner of that is passed in so that we can change owners before or after + * we switch forks between inodes. The operation that the caller is doing will + * determine whether is needs to change owner before or after the switch. + * + * For demand paged transactional modification, the fork switch should be done + * after reading in all the blocks, modifying them and pinning them in the + * transaction. For modification when the buffers are already pinned in memory, + * the fork switch can be done before changing the owner as we won't need to + * validate the owner until the btree buffers are unpinned and writes can occur + * again. + * + * For recovery based ownership change, there is no transactional context and + * so a buffer list must be supplied so that we can record the buffers that we + * modified for the caller to issue IO on. + */ +int +xfs_bmbt_change_owner( + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_ino_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_cur *cur; + int error; + + ASSERT(tp || buffer_list); + ASSERT(!(tp && buffer_list)); + if (whichfork == XFS_DATA_FORK) + ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE); + else + ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE); + + cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); + if (!cur) + return ENOMEM; + + error = xfs_btree_change_owner(cur, new_owner, buffer_list); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + return error; +} diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 6478cfa0e53..819a8a4dee9 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h @@ -18,360 +18,126 @@ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ -#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ - struct xfs_btree_cur; -struct xfs_btree_lblock; +struct xfs_btree_block; struct xfs_mount; struct xfs_inode; - -/* - * Bmap root header, on-disk form only. - */ -typedef struct xfs_bmdr_block { - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ -} xfs_bmdr_block_t; - -/* - * Bmap btree record and extent descriptor. - * For 32-bit kernels, - * l0:31 is an extent flag (value 1 indicates non-normal). - * l0:0-30 and l1:9-31 are startoff. - * l1:0-8, l2:0-31, and l3:21-31 are startblock. - * l3:0-20 are blockcount. - * For 64-bit kernels, - * l0:63 is an extent flag (value 1 indicates non-normal). - * l0:9-62 are startoff. - * l0:0-8 and l1:21-63 are startblock. - * l1:0-20 are blockcount. - */ - -#ifndef XFS_NATIVE_HOST - -#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ -#define BMBT_EXNTFLAG_BITOFF 0 -#define BMBT_EXNTFLAG_BITLEN 1 -#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF + BMBT_EXNTFLAG_BITLEN) -#define BMBT_STARTOFF_BITLEN 54 -#define BMBT_STARTBLOCK_BITOFF (BMBT_STARTOFF_BITOFF + BMBT_STARTOFF_BITLEN) -#define BMBT_STARTBLOCK_BITLEN 52 -#define BMBT_BLOCKCOUNT_BITOFF \ - (BMBT_STARTBLOCK_BITOFF + BMBT_STARTBLOCK_BITLEN) -#define BMBT_BLOCKCOUNT_BITLEN (BMBT_TOTAL_BITLEN - BMBT_BLOCKCOUNT_BITOFF) - -#else - -#define BMBT_TOTAL_BITLEN 128 /* 128 bits, 16 bytes */ -#define BMBT_EXNTFLAG_BITOFF 63 -#define BMBT_EXNTFLAG_BITLEN 1 -#define BMBT_STARTOFF_BITOFF (BMBT_EXNTFLAG_BITOFF - BMBT_STARTOFF_BITLEN) -#define BMBT_STARTOFF_BITLEN 54 -#define BMBT_STARTBLOCK_BITOFF 85 /* 128 - 43 (other 9 is in first word) */ -#define BMBT_STARTBLOCK_BITLEN 52 -#define BMBT_BLOCKCOUNT_BITOFF 64 /* Start of second 64 bit container */ -#define BMBT_BLOCKCOUNT_BITLEN 21 - -#endif /* XFS_NATIVE_HOST */ - - -#define BMBT_USE_64 1 - -typedef struct xfs_bmbt_rec_32 -{ - __uint32_t l0, l1, l2, l3; -} xfs_bmbt_rec_32_t; -typedef struct xfs_bmbt_rec_64 -{ - __uint64_t l0, l1; -} xfs_bmbt_rec_64_t; - -typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ -typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t; - -/* - * Values and macros for delayed-allocation startblock fields. - */ -#define STARTBLOCKVALBITS 17 -#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) -#define DSTARTBLOCKMASKBITS (15 + 20) -#define STARTBLOCKMASK \ - (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) -#define DSTARTBLOCKMASK \ - (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) - -#define ISNULLSTARTBLOCK(x) isnullstartblock(x) -static inline int isnullstartblock(xfs_fsblock_t x) -{ - return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; -} - -#define ISNULLDSTARTBLOCK(x) isnulldstartblock(x) -static inline int isnulldstartblock(xfs_dfsbno_t x) -{ - return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; -} - -#define NULLSTARTBLOCK(k) nullstartblock(k) -static inline xfs_fsblock_t nullstartblock(int k) -{ - ASSERT(k < (1 << STARTBLOCKVALBITS)); - return STARTBLOCKMASK | (k); -} - -#define STARTBLOCKVAL(x) startblockval(x) -static inline xfs_filblks_t startblockval(xfs_fsblock_t x) -{ - return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); -} - -/* - * Possible extent formats. - */ -typedef enum { - XFS_EXTFMT_NOSTATE = 0, - XFS_EXTFMT_HASSTATE -} xfs_exntfmt_t; - -/* - * Possible extent states. - */ -typedef enum { - XFS_EXT_NORM, XFS_EXT_UNWRITTEN, - XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID -} xfs_exntst_t; +struct xfs_trans; /* * Extent state and extent format macros. */ #define XFS_EXTFMT_INODE(x) \ - (XFS_SB_VERSION_HASEXTFLGBIT(&((x)->i_mount->m_sb)) ? \ + (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) #define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) /* - * Incore version of above. - */ -typedef struct xfs_bmbt_irec -{ - xfs_fileoff_t br_startoff; /* starting file offset */ - xfs_fsblock_t br_startblock; /* starting block number */ - xfs_filblks_t br_blockcount; /* number of blocks */ - xfs_exntst_t br_state; /* extent state */ -} xfs_bmbt_irec_t; - -/* - * Key structure for non-leaf levels of the tree. + * Btree block header size depends on a superblock flag. */ -typedef struct xfs_bmbt_key -{ - xfs_dfiloff_t br_startoff; /* starting file offset */ -} xfs_bmbt_key_t, xfs_bmdr_key_t; - -typedef xfs_dfsbno_t xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; /* btree pointer type */ - /* btree block header type */ -typedef struct xfs_btree_lblock xfs_bmbt_block_t; - -#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp)) - -#define XFS_BMAP_IBLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) -#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize) -#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \ - ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \ - (cur)->bc_private.b.whichfork)->if_broot_bytes) - -#define XFS_BMAP_BLOCK_DSIZE(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BMAP_RBLOCK_DSIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur))) -#define XFS_BMAP_BLOCK_ISIZE(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BMAP_RBLOCK_ISIZE(lev,cur) : XFS_BMAP_IBLOCK_SIZE(lev,cur))) - -#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \ - xfs_bmdr, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) -#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ - xfs_bmbt, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0]))) - -#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\ - xfs_bmdr, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) -#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \ - (((lev) == (cur)->bc_nlevels - 1 ? \ - XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\ - xfs_bmbt, (lev) == 0) : \ - ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0]))) - -#define XFS_BMAP_REC_DADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) -#define XFS_BMAP_REC_IADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) - -#define XFS_BMAP_KEY_DADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) -#define XFS_BMAP_KEY_IADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) - -#define XFS_BMAP_PTR_DADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_DSIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) -#define XFS_BMAP_PTR_IADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_BMAP_BLOCK_ISIZE( \ - be16_to_cpu((bb)->bb_level), cur), \ - xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \ - be16_to_cpu((bb)->bb_level), cur))) +#define XFS_BMBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) + +#define XFS_BMBT_REC_ADDR(mp, block, index) \ + ((xfs_bmbt_rec_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_rec_t))) + +#define XFS_BMBT_KEY_ADDR(mp, block, index) \ + ((xfs_bmbt_key_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_bmbt_key_t))) + +#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_bmbt_ptr_t *) \ + ((char *)(block) + \ + XFS_BMBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_bmbt_key_t) + \ + ((index) - 1) * sizeof(xfs_bmbt_ptr_t))) + +#define XFS_BMDR_REC_ADDR(block, index) \ + ((xfs_bmdr_rec_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_rec_t))) + +#define XFS_BMDR_KEY_ADDR(block, index) \ + ((xfs_bmdr_key_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + ((index) - 1) * sizeof(xfs_bmdr_key_t))) + +#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \ + ((xfs_bmdr_ptr_t *) \ + ((char *)(block) + \ + sizeof(struct xfs_bmdr_block) + \ + (maxrecs) * sizeof(xfs_bmdr_key_t) + \ + ((index) - 1) * sizeof(xfs_bmdr_ptr_t))) /* * These are to be used when we know the size of the block and * we don't have a cursor. */ -#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \ - (XFS_BTREE_REC_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) -#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \ - (XFS_BTREE_KEY_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) -#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \ - (XFS_BTREE_PTR_ADDR(sz,xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz))) - -#define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs) -#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0) +#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \ + XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0)) -#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \ - (int)(sizeof(xfs_bmbt_block_t) + \ +#define XFS_BMAP_BROOT_SPACE_CALC(mp, nrecs) \ + (int)(XFS_BMBT_BLOCK_LEN(mp) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) -#define XFS_BMAP_BROOT_SPACE(bb) \ - (XFS_BMAP_BROOT_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) +#define XFS_BMAP_BROOT_SPACE(mp, bb) \ + (XFS_BMAP_BROOT_SPACE_CALC(mp, be16_to_cpu((bb)->bb_numrecs))) #define XFS_BMDR_SPACE_CALC(nrecs) \ (int)(sizeof(xfs_bmdr_block_t) + \ ((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)))) +#define XFS_BMAP_BMDR_SPACE(bb) \ + (XFS_BMDR_SPACE_CALC(be16_to_cpu((bb)->bb_numrecs))) /* * Maximum number of bmap btree levels. */ #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) -#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \ - (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \ - be16_to_cpu((bb)->bb_level) == level && \ - be16_to_cpu((bb)->bb_numrecs) > 0 && \ - be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0]) - - -#ifdef __KERNEL__ - -#if defined(XFS_BMBT_TRACE) -/* - * Trace buffer entry types. - */ -#define XFS_BMBT_KTRACE_ARGBI 1 -#define XFS_BMBT_KTRACE_ARGBII 2 -#define XFS_BMBT_KTRACE_ARGFFFI 3 -#define XFS_BMBT_KTRACE_ARGI 4 -#define XFS_BMBT_KTRACE_ARGIFK 5 -#define XFS_BMBT_KTRACE_ARGIFR 6 -#define XFS_BMBT_KTRACE_ARGIK 7 -#define XFS_BMBT_KTRACE_CUR 8 - -#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */ -#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */ -extern ktrace_t *xfs_bmbt_trace_buf; -#endif - /* * Prototypes for xfs_bmap.c to call. */ -extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int); -extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *); -extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *); -extern void xfs_bmbt_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); -extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur, - int, struct xfs_buf **bpp); -extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_t *r); -extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_t *r); -extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_t *r); +extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, + struct xfs_btree_block *, int); +extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r); +extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); +extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); +extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); -#ifndef XFS_NATIVE_HOST -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); -extern xfs_exntst_t xfs_bmbt_disk_get_state(xfs_bmbt_rec_t *r); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fsblock_t xfs_bmbt_disk_get_startblock(xfs_bmbt_rec_t *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); -#else -#define xfs_bmbt_disk_get_all(r, s) xfs_bmbt_get_all(r, s) -#define xfs_bmbt_disk_get_state(r) xfs_bmbt_get_state(r) -#define xfs_bmbt_disk_get_blockcount(r) xfs_bmbt_get_blockcount(r) -#define xfs_bmbt_disk_get_startblock(r) xfs_bmbt_get_blockcount(r) -#define xfs_bmbt_disk_get_startoff(r) xfs_bmbt_get_startoff(r) -#endif /* XFS_NATIVE_HOST */ -extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *); -extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *); -extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); -extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, - int); -extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t, - xfs_fsblock_t, xfs_filblks_t, int *); -extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t, - xfs_fsblock_t, xfs_filblks_t, int *); - -/* - * Give the bmap btree a new root block. Copy the old broot contents - * down into a real block and make the broot point to it. - */ -extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat); - -extern void xfs_bmbt_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); -extern void xfs_bmbt_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, +extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s); +extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_t *r, xfs_filblks_t v); -extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_t *r, xfs_fsblock_t v); -extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_t *r, xfs_fileoff_t v); -extern void xfs_bmbt_set_state(xfs_bmbt_rec_t *r, xfs_exntst_t v); +extern void xfs_bmbt_set_blockcount(xfs_bmbt_rec_host_t *r, xfs_filblks_t v); +extern void xfs_bmbt_set_startblock(xfs_bmbt_rec_host_t *r, xfs_fsblock_t v); +extern void xfs_bmbt_set_startoff(xfs_bmbt_rec_host_t *r, xfs_fileoff_t v); +extern void xfs_bmbt_set_state(xfs_bmbt_rec_host_t *r, xfs_exntst_t v); -#ifndef XFS_NATIVE_HOST -extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o, xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v); -#else -#define xfs_bmbt_disk_set_all(r, s) xfs_bmbt_set_all(r, s) -#define xfs_bmbt_disk_set_allf(r, o, b, c, v) xfs_bmbt_set_allf(r, o, b, c, v) -#endif /* XFS_NATIVE_HOST */ -extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int); -extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t, - xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t); +extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, + xfs_bmdr_block_t *, int); -#ifdef DEBUG -/* - * Get the data from the pointed-to record. - */ -extern int xfs_bmbt_get_rec(struct xfs_btree_cur *, xfs_fileoff_t *, - xfs_fsblock_t *, xfs_filblks_t *, - xfs_exntst_t *, int *); -#endif +extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); +extern int xfs_bmdr_maxrecs(int blocklen, int leaf); +extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); + +extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, xfs_ino_t new_owner, + struct list_head *buffer_list); -#endif /* __KERNEL__ */ +extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_inode *, int); #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c new file mode 100644 index 00000000000..64731ef3324 --- /dev/null +++ b/fs/xfs/xfs_bmap_util.c @@ -0,0 +1,1897 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_extfree_item.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_rtalloc.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" +#include "xfs_dinode.h" + +/* Kernel only BMAP related definitions and functions */ + +/* + * Convert the given file system block to a disk block. We have to treat it + * differently based on whether the file is a real time file or not, because the + * bmap code does. + */ +xfs_daddr_t +xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) +{ + return (XFS_IS_REALTIME_INODE(ip) ? \ + (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ + XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); +} + +/* + * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi + * caller. Frees all the extents that need freeing, which must be done + * last due to locking considerations. We never free any extents in + * the first transaction. + * + * Return 1 if the given transaction was committed and a new one + * started, and 0 otherwise in the committed parameter. + */ +int /* error */ +xfs_bmap_finish( + xfs_trans_t **tp, /* transaction pointer addr */ + xfs_bmap_free_t *flist, /* i/o: list extents to free */ + int *committed) /* xact committed or not */ +{ + xfs_efd_log_item_t *efd; /* extent free data */ + xfs_efi_log_item_t *efi; /* extent free intention */ + int error; /* error return value */ + xfs_bmap_free_item_t *free; /* free extent item */ + struct xfs_trans_res tres; /* new log reservation */ + xfs_mount_t *mp; /* filesystem mount structure */ + xfs_bmap_free_item_t *next; /* next item on free list */ + xfs_trans_t *ntp; /* new transaction pointer */ + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); + if (flist->xbf_count == 0) { + *committed = 0; + return 0; + } + ntp = *tp; + efi = xfs_trans_get_efi(ntp, flist->xbf_count); + for (free = flist->xbf_first; free; free = free->xbfi_next) + xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + free->xbfi_blockcount); + + tres.tr_logres = ntp->t_log_res; + tres.tr_logcount = ntp->t_log_count; + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + ntp = xfs_trans_dup(*tp); + error = xfs_trans_commit(*tp, 0); + *tp = ntp; + *committed = 1; + /* + * We have a new transaction, so we should return committed=1, + * even though we're returning an error. + */ + if (error) + return error; + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(ntp->t_ticket); + + error = xfs_trans_reserve(ntp, &tres, 0, 0); + if (error) + return error; + efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + for (free = flist->xbf_first; free != NULL; free = next) { + next = free->xbfi_next; + if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + free->xbfi_blockcount))) { + /* + * The bmap free list will be cleaned up at a + * higher level. The EFI will be canceled when + * this transaction is aborted. + * Need to force shutdown here to make sure it + * happens, since this transaction may not be + * dirty yet. + */ + mp = ntp->t_mountp; + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_force_shutdown(mp, + (error == EFSCORRUPTED) ? + SHUTDOWN_CORRUPT_INCORE : + SHUTDOWN_META_IO_ERROR); + return error; + } + xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + free->xbfi_blockcount); + xfs_bmap_del_free(flist, NULL, free); + } + return 0; +} + +int +xfs_bmap_rtalloc( + struct xfs_bmalloca *ap) /* bmap alloc argument struct */ +{ + xfs_alloctype_t atype = 0; /* type for allocation routines */ + int error; /* error return value */ + xfs_mount_t *mp; /* mount point structure */ + xfs_extlen_t prod = 0; /* product factor for allocators */ + xfs_extlen_t ralen = 0; /* realtime allocation length */ + xfs_extlen_t align; /* minimum allocation alignment */ + xfs_rtblock_t rtb; + + mp = ap->ip->i_mount; + align = xfs_get_extsz_hint(ap->ip); + prod = align / mp->m_sb.sb_rextsize; + error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, + align, 1, ap->eof, 0, + ap->conv, &ap->offset, &ap->length); + if (error) + return error; + ASSERT(ap->length); + ASSERT(ap->length % mp->m_sb.sb_rextsize == 0); + + /* + * If the offset & length are not perfectly aligned + * then kill prod, it will just get us in trouble. + */ + if (do_mod(ap->offset, align) || ap->length % align) + prod = 1; + /* + * Set ralen to be the actual requested length in rtextents. + */ + ralen = ap->length / mp->m_sb.sb_rextsize; + /* + * If the old value was close enough to MAXEXTLEN that + * we rounded up to it, cut it back so it's valid again. + * Note that if it's a really large request (bigger than + * MAXEXTLEN), we don't hear about that number, and can't + * adjust the starting point to match it. + */ + if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) + ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; + + /* + * Lock out other modifications to the RT bitmap inode. + */ + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL); + + /* + * If it's an allocation to an empty file at offset 0, + * pick an extent that will space things out in the rt area. + */ + if (ap->eof && ap->offset == 0) { + xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */ + + error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx); + if (error) + return error; + ap->blkno = rtx * mp->m_sb.sb_rextsize; + } else { + ap->blkno = 0; + } + + xfs_bmap_adjacent(ap); + + /* + * Realtime allocation, done through xfs_rtallocate_extent. + */ + atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO; + do_div(ap->blkno, mp->m_sb.sb_rextsize); + rtb = ap->blkno; + ap->length = ralen; + if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length, + &ralen, atype, ap->wasdel, prod, &rtb))) + return error; + if (rtb == NULLFSBLOCK && prod > 1 && + (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, + ap->length, &ralen, atype, + ap->wasdel, 1, &rtb))) + return error; + ap->blkno = rtb; + if (ap->blkno != NULLFSBLOCK) { + ap->blkno *= mp->m_sb.sb_rextsize; + ralen *= mp->m_sb.sb_rextsize; + ap->length = ralen; + ap->ip->i_d.di_nblocks += ralen; + xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); + if (ap->wasdel) + ap->ip->i_delayed_blks -= ralen; + /* + * Adjust the disk quota also. This was reserved + * earlier. + */ + xfs_trans_mod_dquot_byino(ap->tp, ap->ip, + ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : + XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + } else { + ap->length = 0; + } + return 0; +} + +/* + * Check if the endoff is outside the last extent. If so the caller will grow + * the allocation to a stripe unit boundary. All offsets are considered outside + * the end of file for an empty fork, so 1 is returned in *eof in that case. + */ +int +xfs_bmap_eof( + struct xfs_inode *ip, + xfs_fileoff_t endoff, + int whichfork, + int *eof) +{ + struct xfs_bmbt_irec rec; + int error; + + error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof); + if (error || *eof) + return error; + + *eof = endoff >= rec.br_startoff + rec.br_blockcount; + return 0; +} + +/* + * Extent tree block counting routines. + */ + +/* + * Count leaf blocks given a range of extent records. + */ +STATIC void +xfs_bmap_count_leaves( + xfs_ifork_t *ifp, + xfs_extnum_t idx, + int numrecs, + int *count) +{ + int b; + + for (b = 0; b < numrecs; b++) { + xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b); + *count += xfs_bmbt_get_blockcount(frp); + } +} + +/* + * Count leaf blocks given a range of extent records originally + * in btree format. + */ +STATIC void +xfs_bmap_disk_count_leaves( + struct xfs_mount *mp, + struct xfs_btree_block *block, + int numrecs, + int *count) +{ + int b; + xfs_bmbt_rec_t *frp; + + for (b = 1; b <= numrecs; b++) { + frp = XFS_BMBT_REC_ADDR(mp, block, b); + *count += xfs_bmbt_disk_get_blockcount(frp); + } +} + +/* + * Recursively walks each level of a btree + * to count total fsblocks in use. + */ +STATIC int /* error */ +xfs_bmap_count_tree( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fsblock_t blockno, /* file system block number */ + int levelin, /* level in btree */ + int *count) /* Count of blocks */ +{ + int error; + xfs_buf_t *bp, *nbp; + int level = levelin; + __be64 *pp; + xfs_fsblock_t bno = blockno; + xfs_fsblock_t nextbno; + struct xfs_btree_block *block, *nextblock; + int numrecs; + + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + + if (--level) { + /* Not at node above leaves, count this level of nodes */ + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + while (nextbno != NULLFSBLOCK) { + error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + nextblock = XFS_BUF_TO_BLOCK(nbp); + nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib); + xfs_trans_brelse(tp, nbp); + } + + /* Dive to the next level */ + pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); + bno = be64_to_cpu(*pp); + if (unlikely((error = + xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) { + xfs_trans_brelse(tp, bp); + XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", + XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + xfs_trans_brelse(tp, bp); + } else { + /* count all level 1 nodes and their leaves */ + for (;;) { + nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); + numrecs = be16_to_cpu(block->bb_numrecs); + xfs_bmap_disk_count_leaves(mp, block, numrecs, count); + xfs_trans_brelse(tp, bp); + if (nextbno == NULLFSBLOCK) + break; + bno = nextbno; + error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, + XFS_BMAP_BTREE_REF, + &xfs_bmbt_buf_ops); + if (error) + return error; + *count += 1; + block = XFS_BUF_TO_BLOCK(bp); + } + } + return 0; +} + +/* + * Count fsblocks of the given fork. + */ +int /* error */ +xfs_bmap_count_blocks( + xfs_trans_t *tp, /* transaction pointer */ + xfs_inode_t *ip, /* incore inode */ + int whichfork, /* data or attr fork */ + int *count) /* out: count of blocks */ +{ + struct xfs_btree_block *block; /* current btree block */ + xfs_fsblock_t bno; /* block # of "block" */ + xfs_ifork_t *ifp; /* fork structure */ + int level; /* btree level, for checking */ + xfs_mount_t *mp; /* file system mount structure */ + __be64 *pp; /* pointer to block address */ + + bno = NULLFSBLOCK; + mp = ip->i_mount; + ifp = XFS_IFORK_PTR(ip, whichfork); + if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { + xfs_bmap_count_leaves(ifp, 0, + ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), + count); + return 0; + } + + /* + * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. + */ + block = ifp->if_broot; + level = be16_to_cpu(block->bb_level); + ASSERT(level > 0); + pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); + bno = be64_to_cpu(*pp); + ASSERT(bno != NULLDFSBNO); + ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); + ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); + + if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { + XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, + mp); + return XFS_ERROR(EFSCORRUPTED); + } + + return 0; +} + +/* + * returns 1 for success, 0 if we failed to map the extent. + */ +STATIC int +xfs_getbmapx_fix_eof_hole( + xfs_inode_t *ip, /* xfs incore inode pointer */ + struct getbmapx *out, /* output structure */ + int prealloced, /* this is a file with + * preallocated data space */ + __int64_t end, /* last block requested */ + xfs_fsblock_t startblock) +{ + __int64_t fixlen; + xfs_mount_t *mp; /* file system mount point */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t lastx; /* last extent pointer */ + xfs_fileoff_t fileblock; + + if (startblock == HOLESTARTBLOCK) { + mp = ip->i_mount; + out->bmv_block = -1; + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip))); + fixlen -= out->bmv_offset; + if (prealloced && out->bmv_offset + out->bmv_length == end) { + /* Came to hole at EOF. Trim it. */ + if (fixlen <= 0) + return 0; + out->bmv_length = fixlen; + } + } else { + if (startblock == DELAYSTARTBLOCK) + out->bmv_block = -2; + else + out->bmv_block = xfs_fsb_to_db(ip, startblock); + fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && + (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + out->bmv_oflags |= BMV_OF_LAST; + } + + return 1; +} + +/* + * Get inode's extents as described in bmv, and format for output. + * Calls formatter to fill the user's buffer until all extents + * are mapped, until the passed-in bmv->bmv_count slots have + * been filled, or until the formatter short-circuits the loop, + * if it is tracking filled-in extents on its own. + */ +int /* error code */ +xfs_getbmap( + xfs_inode_t *ip, + struct getbmapx *bmv, /* user bmap structure */ + xfs_bmap_format_t formatter, /* format to user */ + void *arg) /* formatter arg */ +{ + __int64_t bmvend; /* last block requested */ + int error = 0; /* return value */ + __int64_t fixlen; /* length for -1 case */ + int i; /* extent number */ + int lock; /* lock state */ + xfs_bmbt_irec_t *map; /* buffer for user's data */ + xfs_mount_t *mp; /* file system mount point */ + int nex; /* # of user extents can do */ + int nexleft; /* # of user extents left */ + int subnex; /* # of bmapi's can do */ + int nmap; /* number of map entries */ + struct getbmapx *out; /* output structure */ + int whichfork; /* data or attr fork */ + int prealloced; /* this is a file with + * preallocated data space */ + int iflags; /* interface flags */ + int bmapi_flags; /* flags for xfs_bmapi */ + int cur_ext = 0; + + mp = ip->i_mount; + iflags = bmv->bmv_iflags; + whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; + + if (whichfork == XFS_ATTR_FORK) { + if (XFS_IFORK_Q(ip)) { + if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && + ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + } else if (unlikely( + ip->i_d.di_aformat != 0 && + ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { + XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + + prealloced = 0; + fixlen = 1LL << 32; + } else { + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) + return XFS_ERROR(EINVAL); + + if (xfs_get_extsz_hint(ip) || + ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ + prealloced = 1; + fixlen = mp->m_super->s_maxbytes; + } else { + prealloced = 0; + fixlen = XFS_ISIZE(ip); + } + } + + if (bmv->bmv_length == -1) { + fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen)); + bmv->bmv_length = + max_t(__int64_t, fixlen - bmv->bmv_offset, 0); + } else if (bmv->bmv_length == 0) { + bmv->bmv_entries = 0; + return 0; + } else if (bmv->bmv_length < 0) { + return XFS_ERROR(EINVAL); + } + + nex = bmv->bmv_count - 1; + if (nex <= 0) + return XFS_ERROR(EINVAL); + bmvend = bmv->bmv_offset + bmv->bmv_length; + + + if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) + return XFS_ERROR(ENOMEM); + out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); + if (!out) + return XFS_ERROR(ENOMEM); + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + if (whichfork == XFS_DATA_FORK) { + if (!(iflags & BMV_IF_DELALLOC) && + (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { + error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + if (error) + goto out_unlock_iolock; + + /* + * Even after flushing the inode, there can still be + * delalloc blocks on the inode beyond EOF due to + * speculative preallocation. These are not removed + * until the release function is called or the inode + * is inactivated. Hence we cannot assert here that + * ip->i_delayed_blks == 0. + */ + } + + lock = xfs_ilock_data_map_shared(ip); + } else { + lock = xfs_ilock_attr_map_shared(ip); + } + + /* + * Don't let nex be bigger than the number of extents + * we can have assuming alternating holes and real extents. + */ + if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1) + nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1; + + bmapi_flags = xfs_bmapi_aflag(whichfork); + if (!(iflags & BMV_IF_PREALLOC)) + bmapi_flags |= XFS_BMAPI_IGSTATE; + + /* + * Allocate enough space to handle "subnex" maps at a time. + */ + error = ENOMEM; + subnex = 16; + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); + if (!map) + goto out_unlock_ilock; + + bmv->bmv_entries = 0; + + if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 && + (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) { + error = 0; + goto out_free_map; + } + + nexleft = nex; + + do { + nmap = (nexleft > subnex) ? subnex : nexleft; + error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), + XFS_BB_TO_FSB(mp, bmv->bmv_length), + map, &nmap, bmapi_flags); + if (error) + goto out_free_map; + ASSERT(nmap <= subnex); + + for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) { + out[cur_ext].bmv_oflags = 0; + if (map[i].br_state == XFS_EXT_UNWRITTEN) + out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; + else if (map[i].br_startblock == DELAYSTARTBLOCK) + out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC; + out[cur_ext].bmv_offset = + XFS_FSB_TO_BB(mp, map[i].br_startoff); + out[cur_ext].bmv_length = + XFS_FSB_TO_BB(mp, map[i].br_blockcount); + out[cur_ext].bmv_unused1 = 0; + out[cur_ext].bmv_unused2 = 0; + + /* + * delayed allocation extents that start beyond EOF can + * occur due to speculative EOF allocation when the + * delalloc extent is larger than the largest freespace + * extent at conversion time. These extents cannot be + * converted by data writeback, so can exist here even + * if we are not supposed to be finding delalloc + * extents. + */ + if (map[i].br_startblock == DELAYSTARTBLOCK && + map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip))) + ASSERT((iflags & BMV_IF_DELALLOC) != 0); + + if (map[i].br_startblock == HOLESTARTBLOCK && + whichfork == XFS_ATTR_FORK) { + /* came to the end of attribute fork */ + out[cur_ext].bmv_oflags |= BMV_OF_LAST; + goto out_free_map; + } + + if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext], + prealloced, bmvend, + map[i].br_startblock)) + goto out_free_map; + + bmv->bmv_offset = + out[cur_ext].bmv_offset + + out[cur_ext].bmv_length; + bmv->bmv_length = + max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; + bmv->bmv_entries++; + cur_ext++; + } + } while (nmap && nexleft && bmv->bmv_length); + + out_free_map: + kmem_free(map); + out_unlock_ilock: + xfs_iunlock(ip, lock); + out_unlock_iolock: + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + + for (i = 0; i < cur_ext; i++) { + int full = 0; /* user array is full */ + + /* format results & advance arg */ + error = formatter(&arg, &out[i], &full); + if (error || full) + break; + } + + kmem_free(out); + return error; +} + +/* + * dead simple method of punching delalyed allocation blocks from a range in + * the inode. Walks a block at a time so will be slow, but is only executed in + * rare error cases so the overhead is not critical. This will always punch out + * both the start and end blocks, even if the ranges only partially overlap + * them, so it is up to the caller to ensure that partial blocks are not + * passed in. + */ +int +xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, + xfs_fileoff_t start_fsb, + xfs_fileoff_t length) +{ + xfs_fileoff_t remaining = length; + int error = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + do { + int done; + xfs_bmbt_irec_t imap; + int nimaps = 1; + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + + /* + * Map the range first and check that it is a delalloc extent + * before trying to unmap the range. Otherwise we will be + * trying to remove a real extent (which requires a + * transaction) or a hole, which is probably a bad idea... + */ + error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps, + XFS_BMAPI_ENTIRE); + + if (error) { + /* something screwed, just bail */ + if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_alert(ip->i_mount, + "Failed delalloc mapping lookup ino %lld fsb %lld.", + ip->i_ino, start_fsb); + } + break; + } + if (!nimaps) { + /* nothing there */ + goto next_block; + } + if (imap.br_startblock != DELAYSTARTBLOCK) { + /* been converted, ignore */ + goto next_block; + } + WARN_ON(imap.br_blockcount == 0); + + /* + * Note: while we initialise the firstblock/flist pair, they + * should never be used because blocks should never be + * allocated or freed for a delalloc extent and hence we need + * don't cancel or finish them after the xfs_bunmapi() call. + */ + xfs_bmap_init(&flist, &firstblock); + error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock, + &flist, &done); + if (error) + break; + + ASSERT(!flist.xbf_count && !flist.xbf_first); +next_block: + start_fsb++; + remaining--; + } while(remaining > 0); + + return error; +} + +/* + * Test whether it is appropriate to check an inode for and free post EOF + * blocks. The 'force' parameter determines whether we should also consider + * regular files that are marked preallocated or append-only. + */ +bool +xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) +{ + /* prealloc/delalloc exists only on regular files */ + if (!S_ISREG(ip->i_d.di_mode)) + return false; + + /* + * Zero sized files with no cached pages and delalloc blocks will not + * have speculative prealloc/delalloc blocks to remove. + */ + if (VFS_I(ip)->i_size == 0 && + VN_CACHED(VFS_I(ip)) == 0 && + ip->i_delayed_blks == 0) + return false; + + /* If we haven't read in the extent list, then don't do it now. */ + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return false; + + /* + * Do not free real preallocated or append-only files unless the file + * has delalloc blocks and we are forced to remove them. + */ + if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) + if (!force || ip->i_delayed_blks == 0) + return false; + + return true; +} + +/* + * This is called by xfs_inactive to free any blocks beyond eof + * when the link count isn't zero and by xfs_dm_punch_hole() when + * punching a hole to EOF. + */ +int +xfs_free_eofblocks( + xfs_mount_t *mp, + xfs_inode_t *ip, + bool need_iolock) +{ + xfs_trans_t *tp; + int error; + xfs_fileoff_t end_fsb; + xfs_fileoff_t last_fsb; + xfs_filblks_t map_len; + int nimaps; + xfs_bmbt_irec_t imap; + + /* + * Figure out if there are any blocks beyond the end + * of the file. If not, then there is nothing to do. + */ + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip)); + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (last_fsb <= end_fsb) + return 0; + map_len = last_fsb - end_fsb; + + nimaps = 1; + xfs_ilock(ip, XFS_ILOCK_SHARED); + error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!error && (nimaps != 0) && + (imap.br_startblock != HOLESTARTBLOCK || + ip->i_delayed_blks)) { + /* + * Attach the dquots to the inode up front. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * There are blocks after the end of file. + * Free them up now by truncating the file to + * its current size. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + if (need_iolock) { + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { + xfs_trans_cancel(tp, 0); + return EAGAIN; + } + } + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Do not update the on-disk file size. If we update the + * on-disk file size and then the system crashes before the + * contents of the file are flushed to disk then the files + * may be full of holes (ie NULL files bug). + */ + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip)); + if (error) { + /* + * If we get an error at this point we simply don't + * bother truncating the file. + */ + xfs_trans_cancel(tp, + (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + } else { + error = xfs_trans_commit(tp, + XFS_TRANS_RELEASE_LOG_RES); + if (!error) + xfs_inode_clear_eofblocks_tag(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (need_iolock) + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } + return error; +} + +int +xfs_alloc_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + int alloc_type) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_off_t count; + xfs_filblks_t allocated_fsb; + xfs_filblks_t allocatesize_fsb; + xfs_extlen_t extsz, temp; + xfs_fileoff_t startoffset_fsb; + xfs_fsblock_t firstfsb; + int nimaps; + int quota_flag; + int rt; + xfs_trans_t *tp; + xfs_bmbt_irec_t imaps[1], *imapp; + xfs_bmap_free_t free_list; + uint qblocks, resblks, resrtextents; + int committed; + int error; + + trace_xfs_alloc_file_space(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + if (len <= 0) + return XFS_ERROR(EINVAL); + + rt = XFS_IS_REALTIME_INODE(ip); + extsz = xfs_get_extsz_hint(ip); + + count = len; + imapp = &imaps[0]; + nimaps = 1; + startoffset_fsb = XFS_B_TO_FSBT(mp, offset); + allocatesize_fsb = XFS_B_TO_FSB(mp, count); + + /* + * Allocate file space until done or until there is an error + */ + while (allocatesize_fsb && !error) { + xfs_fileoff_t s, e; + + /* + * Determine space reservations for data/realtime. + */ + if (unlikely(extsz)) { + s = startoffset_fsb; + do_div(s, extsz); + s *= extsz; + e = startoffset_fsb + allocatesize_fsb; + if ((temp = do_mod(startoffset_fsb, extsz))) + e += temp; + if ((temp = do_mod(e, extsz))) + e += extsz - temp; + } else { + s = 0; + e = allocatesize_fsb; + } + + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); + if (unlikely(rt)) { + resrtextents = qblocks = resblks; + resrtextents /= mp->m_sb.sb_rextsize; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); + quota_flag = XFS_QMOPT_RES_REGBLKS; + } + + /* + * Allocate and setup the transaction. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); + /* + * Check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, + 0, quota_flag); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bmapi_write(tp, ip, startoffset_fsb, + allocatesize_fsb, alloc_type, &firstfsb, + 0, imapp, &nimaps, &free_list); + if (error) { + goto error0; + } + + /* + * Complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + break; + } + + allocated_fsb = imapp->br_blockcount; + + if (nimaps == 0) { + error = XFS_ERROR(ENOSPC); + break; + } + + startoffset_fsb += allocated_fsb; + allocatesize_fsb -= allocated_fsb; + } + + return error; + +error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); + +error1: /* Just cancel transaction */ + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * Zero file bytes between startoff and endoff inclusive. + * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. + */ +STATIC int +xfs_zero_remaining_bytes( + xfs_inode_t *ip, + xfs_off_t startoff, + xfs_off_t endoff) +{ + xfs_bmbt_irec_t imap; + xfs_fileoff_t offset_fsb; + xfs_off_t lastoffset; + xfs_off_t offset; + xfs_buf_t *bp; + xfs_mount_t *mp = ip->i_mount; + int nimap; + int error = 0; + + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= XFS_ISIZE(ip)) + return 0; + + if (endoff > XFS_ISIZE(ip)) + endoff = XFS_ISIZE(ip); + + bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + BTOBB(mp->m_sb.sb_blocksize), 0); + if (!bp) + return XFS_ERROR(ENOMEM); + + xfs_buf_unlock(bp); + + for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { + uint lock_mode; + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + nimap = 1; + + lock_mode = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); + xfs_iunlock(ip, lock_mode); + + if (error || nimap < 1) + break; + ASSERT(imap.br_blockcount >= 1); + ASSERT(imap.br_startoff == offset_fsb); + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; + if (lastoffset > endoff) + lastoffset = endoff; + if (imap.br_startblock == HOLESTARTBLOCK) + continue; + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + if (imap.br_state == XFS_EXT_UNWRITTEN) + continue; + XFS_BUF_UNDONE(bp); + XFS_BUF_UNWRITE(bp); + XFS_BUF_READ(bp); + XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(read)"); + break; + } + memset(bp->b_addr + + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + XFS_BUF_UNDONE(bp); + XFS_BUF_UNREAD(bp); + XFS_BUF_WRITE(bp); + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + break; + } + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, + "xfs_zero_remaining_bytes(write)"); + break; + } + } + xfs_buf_free(bp); + return error; +} + +int +xfs_free_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int committed; + int done; + xfs_fileoff_t endoffset_fsb; + int error; + xfs_fsblock_t firstfsb; + xfs_bmap_free_t free_list; + xfs_bmbt_irec_t imap; + xfs_off_t ioffset; + xfs_extlen_t mod=0; + xfs_mount_t *mp; + int nimap; + uint resblks; + xfs_off_t rounding; + int rt; + xfs_fileoff_t startoffset_fsb; + xfs_trans_t *tp; + + mp = ip->i_mount; + + trace_xfs_free_file_space(ip); + + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + error = 0; + if (len <= 0) /* if nothing being freed */ + return error; + rt = XFS_IS_REALTIME_INODE(ip); + startoffset_fsb = XFS_B_TO_FSB(mp, offset); + endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len); + + /* wait for the completion of any pending DIOs */ + inode_dio_wait(VFS_I(ip)); + + rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + ioffset = offset & ~(rounding - 1); + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ioffset, -1); + if (error) + goto out; + truncate_pagecache_range(VFS_I(ip), ioffset, -1); + + /* + * Need to zero the stuff we're not freeing, on disk. + * If it's a realtime file & can't use unwritten extents then we + * actually need to zero the extent edges. Otherwise xfs_bunmapi + * will take care of it for us. + */ + if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { + nimap = 1; + error = xfs_bmapi_read(ip, startoffset_fsb, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + xfs_daddr_t block; + + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + block = imap.br_startblock; + mod = do_div(block, mp->m_sb.sb_rextsize); + if (mod) + startoffset_fsb += mp->m_sb.sb_rextsize - mod; + } + nimap = 1; + error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1, + &imap, &nimap, 0); + if (error) + goto out; + ASSERT(nimap == 0 || nimap == 1); + if (nimap && imap.br_startblock != HOLESTARTBLOCK) { + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + mod++; + if (mod && (mod != mp->m_sb.sb_rextsize)) + endoffset_fsb -= mod; + } + } + if ((done = (endoffset_fsb <= startoffset_fsb))) + /* + * One contiguous piece to clear + */ + error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); + else { + /* + * Some full blocks, possibly two pieces to clear + */ + if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) + error = xfs_zero_remaining_bytes(ip, offset, + XFS_FSB_TO_B(mp, startoffset_fsb) - 1); + if (!error && + XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) + error = xfs_zero_remaining_bytes(ip, + XFS_FSB_TO_B(mp, endoffset_fsb), + offset + len - 1); + } + + /* + * free file space until done or until there is an error + */ + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + while (!error && !done) { + + /* + * allocate and setup the transaction. Allow this + * transaction to dip into the reserve blocks to ensure + * the freeing of the space succeeds at ENOSPC. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); + + /* + * check for running out of space + */ + if (error) { + /* + * Free the transaction structure. + */ + ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + break; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, + ip->i_udquot, ip->i_gdquot, ip->i_pdquot, + resblks, 0, XFS_QMOPT_RES_REGBLKS); + if (error) + goto error1; + + xfs_trans_ijoin(tp, ip, 0); + + /* + * issue the bunmapi() call to free the blocks + */ + xfs_bmap_init(&free_list, &firstfsb); + error = xfs_bunmapi(tp, ip, startoffset_fsb, + endoffset_fsb - startoffset_fsb, + 0, 2, &firstfsb, &free_list, &done); + if (error) { + goto error0; + } + + /* + * complete the transaction + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error0; + } + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + out: + return error; + + error0: + xfs_bmap_cancel(&free_list); + error1: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + goto out; +} + + +int +xfs_zero_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + struct xfs_mount *mp = ip->i_mount; + uint granularity; + xfs_off_t start_boundary; + xfs_off_t end_boundary; + int error; + + trace_xfs_zero_file_space(ip); + + granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); + + /* + * Round the range of extents we are going to convert inwards. If the + * offset is aligned, then it doesn't get changed so we zero from the + * start of the block offset points to. + */ + start_boundary = round_up(offset, granularity); + end_boundary = round_down(offset + len, granularity); + + ASSERT(start_boundary >= offset); + ASSERT(end_boundary <= offset + len); + + if (start_boundary < end_boundary - 1) { + /* + * punch out delayed allocation blocks and the page cache over + * the conversion range + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, start_boundary), + XFS_B_TO_FSB(mp, end_boundary - start_boundary)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + truncate_pagecache_range(VFS_I(ip), start_boundary, + end_boundary - 1); + + /* convert the blocks */ + error = xfs_alloc_file_space(ip, start_boundary, + end_boundary - start_boundary - 1, + XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT); + if (error) + goto out; + + /* We've handled the interior of the range, now for the edges */ + if (start_boundary != offset) { + error = xfs_iozero(ip, offset, start_boundary - offset); + if (error) + goto out; + } + + if (end_boundary != offset + len) + error = xfs_iozero(ip, end_boundary, + offset + len - end_boundary); + + } else { + /* + * It's either a sub-granularity range or the range spanned lies + * partially across two adjacent blocks. + */ + error = xfs_iozero(ip, offset, len); + } + +out: + return error; + +} + +/* + * xfs_collapse_file_space() + * This routine frees disk space and shift extent for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It would also sync dirty data + * and invalidate page cache over the region on which collapse range + * is working. And Shift extent records to the left to cover a hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int done = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + xfs_extnum_t current_ext = 0; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + int committed; + xfs_fileoff_t start_fsb; + xfs_fileoff_t shift_fsb; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + + trace_xfs_collapse_file_space(ip); + + start_fsb = XFS_B_TO_FSB(mp, offset + len); + shift_fsb = XFS_B_TO_FSB(mp, len); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + while (!error && !done) { + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + /* + * We would need to reserve permanent block for transaction. + * This will come into picture when after shifting extent into + * hole we found that adjacent extents can be merged which + * may lead to freeing of a block during record update. + */ + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp, 0); + break; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, + ip->i_gdquot, ip->i_pdquot, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, + XFS_QMOPT_RES_REGBLKS); + if (error) + goto out; + + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + + /* + * We are using the write transaction in which max 2 bmbt + * updates are allowed + */ + error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, + shift_fsb, ¤t_ext, + &first_block, &free_list, + XFS_BMAP_MAX_SHIFT_EXTENTS); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + return error; + +out: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(ip) && + XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) + return EINVAL; + } + + /* Reciprocal target->temp btree format checks */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + if (XFS_IFORK_BOFF(tip) && + XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) + return EINVAL; + if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= + XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) + return EINVAL; + } + + return 0; +} + +int +xfs_swap_extents( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ + xfs_swapext_t *sxp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + xfs_bstat_t *sbp = &sxp->sx_stat; + xfs_ifork_t *tempifp, *ifp, *tifp; + int src_log_flags, target_log_flags; + int error = 0; + int aforkblks = 0; + int taforkblks = 0; + __uint64_t tmp; + + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); + if (!tempifp) { + error = XFS_ERROR(ENOMEM); + goto out; + } + + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* Verify that both files have the same format */ + if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify both files are either real-time or non-realtime */ + if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); + if (error) + goto out_unlock; + truncate_pagecache_range(VFS_I(tip), 0, -1); + + /* Verify O_DIRECT for ftmp */ + if (VN_CACHED(VFS_I(tip)) != 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* Verify all data are being swapped */ + if (sxp->sx_offset != 0 || + sxp->sx_length != ip->i_d.di_size || + sxp->sx_length != tip->i_d.di_size) { + error = XFS_ERROR(EFAULT); + goto out_unlock; + } + + trace_xfs_swap_extent_before(ip, 0); + trace_xfs_swap_extent_before(tip, 1); + + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_notice(mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __func__, ip->i_ino); + goto out_unlock; + } + + /* + * Compare the current change & modify times with that + * passed in. If they differ, we abort this swap. + * This is the mechanism used to ensure the calling + * process that the file was not changed out from + * under it. + */ + if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + /* We need to fail if the file is memory mapped. Once we have tossed + * all existing pages, the page fault will have no option + * but to go to the filesystem for pages. By making the page fault call + * vop_read (or write in the case of autogrow) they block on the iolock + * until we have switched the extents. + */ + if (VN_MAPPED(VFS_I(ip))) { + error = XFS_ERROR(EBUSY); + goto out_unlock; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); + + /* + * There is a race condition here since we gave up the + * ilock. However, the data fork will not change since + * we have the iolock (locked for truncation too) so we + * are safe. We don't really care if non-io related + * fields change. + */ + truncate_pagecache_range(VFS_I(ip), 0, -1); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_IOLOCK_EXCL); + xfs_trans_cancel(tp, 0); + goto out; + } + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); + + /* + * Count the number of extended attribute blocks + */ + if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && + (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); + if (error) + goto out_trans_cancel; + } + if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && + (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { + error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, + &taforkblks); + if (error) + goto out_trans_cancel; + } + + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + + /* + * Before we've swapped the forks, lets set the owners of the forks + * appropriately. We have to do this as we are demand paging the btree + * buffers, and so the validation done on read will expect the owner + * field to be correctly set. Once we change the owners, we can swap the + * inode forks. + * + * Note the trickiness in setting the log flags - we set the owner log + * flag on the opposite inode (i.e. the inode we are setting the new + * owner to be) because once we swap the forks and log that, log + * recovery is going to see the fork as owned by the swapped inode, + * not the pre-swapped inodes. + */ + src_log_flags = XFS_ILOG_CORE; + target_log_flags = XFS_ILOG_CORE; + if (ip->i_d.di_version == 3 && + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + target_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, + tip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + if (tip->i_d.di_version == 3 && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + src_log_flags |= XFS_ILOG_DOWNER; + error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, + ip->i_ino, NULL); + if (error) + goto out_trans_cancel; + } + + /* + * Swap the data forks of the inodes + */ + ifp = &ip->i_df; + tifp = &tip->i_df; + *tempifp = *ifp; /* struct copy */ + *ifp = *tifp; /* struct copy */ + *tifp = *tempifp; /* struct copy */ + + /* + * Fix the on-disk inode values + */ + tmp = (__uint64_t)ip->i_d.di_nblocks; + ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; + tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; + + tmp = (__uint64_t) ip->i_d.di_nextents; + ip->i_d.di_nextents = tip->i_d.di_nextents; + tip->i_d.di_nextents = tmp; + + tmp = (__uint64_t) ip->i_d.di_format; + ip->i_d.di_format = tip->i_d.di_format; + tip->i_d.di_format = tmp; + + /* + * The extents in the source inode could still contain speculative + * preallocation beyond EOF (e.g. the file is open but not modified + * while defrag is in progress). In that case, we need to copy over the + * number of delalloc blocks the data fork in the source inode is + * tracking beyond EOF so that when the fork is truncated away when the + * temporary inode is unlinked we don't underrun the i_delayed_blks + * counter on that inode. + */ + ASSERT(tip->i_delayed_blks == 0); + tip->i_delayed_blks = ip->i_delayed_blks; + ip->i_delayed_blks = 0; + + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { + ifp->if_u1.if_extents = + ifp->if_u2.if_inline_ext; + } + src_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + ASSERT(ip->i_d.di_version < 3 || + (src_log_flags & XFS_ILOG_DOWNER)); + src_log_flags |= XFS_ILOG_DBROOT; + break; + } + + switch (tip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + /* If the extents fit in the inode, fix the + * pointer. Otherwise it's already NULL or + * pointing to the extent. + */ + if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { + tifp->if_u1.if_extents = + tifp->if_u2.if_inline_ext; + } + target_log_flags |= XFS_ILOG_DEXT; + break; + case XFS_DINODE_FMT_BTREE: + target_log_flags |= XFS_ILOG_DBROOT; + ASSERT(tip->i_d.di_version < 3 || + (target_log_flags & XFS_ILOG_DOWNER)); + break; + } + + xfs_trans_log_inode(tp, ip, src_log_flags); + xfs_trans_log_inode(tp, tip, target_log_flags); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, 0); + + trace_xfs_swap_extent_after(ip, 0); + trace_xfs_swap_extent_after(tip, 1); +out: + kmem_free(tempifp); + return error; + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); + goto out; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + goto out_unlock; +} diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h new file mode 100644 index 00000000000..2fdb72d2c90 --- /dev/null +++ b/fs/xfs/xfs_bmap_util.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BMAP_UTIL_H__ +#define __XFS_BMAP_UTIL_H__ + +/* Kernel only BMAP related definitions and functions */ + +struct xfs_bmbt_irec; +struct xfs_bmap_free_item; +struct xfs_ifork; +struct xfs_inode; +struct xfs_mount; +struct xfs_trans; + +/* + * Argument structure for xfs_bmap_alloc. + */ +struct xfs_bmalloca { + xfs_fsblock_t *firstblock; /* i/o first block allocated */ + struct xfs_bmap_free *flist; /* bmap freelist */ + struct xfs_trans *tp; /* transaction pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bmbt_irec prev; /* extent before the new one */ + struct xfs_bmbt_irec got; /* extent after, or delayed */ + + xfs_fileoff_t offset; /* offset in file filling in */ + xfs_extlen_t length; /* i/o length asked/allocated */ + xfs_fsblock_t blkno; /* starting block of new extent */ + + struct xfs_btree_cur *cur; /* btree cursor */ + xfs_extnum_t idx; /* current extent index */ + int nallocs;/* number of extents alloc'd */ + int logflags;/* flags for transaction logging */ + + xfs_extlen_t total; /* total blocks needed for xaction */ + xfs_extlen_t minlen; /* minimum allocation size (blocks) */ + xfs_extlen_t minleft; /* amount must be left after alloc */ + bool eof; /* set if allocating past last extent */ + bool wasdel; /* replacing a delayed allocation */ + bool userdata;/* set if is user data */ + bool aeof; /* allocated space at eof */ + bool conv; /* overwriting unwritten extents */ + int flags; + struct completion *done; + struct work_struct work; + int result; +}; + +int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, + int *committed); +int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); +int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, + int whichfork, int *eof); +int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, int *count); +int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, + xfs_fileoff_t start_fsb, xfs_fileoff_t length); + +/* bmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *); +int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv, + xfs_bmap_format_t formatter, void *arg); + +/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ +void xfs_bmap_del_free(struct xfs_bmap_free *flist, + struct xfs_bmap_free_item *prev, + struct xfs_bmap_free_item *free); +int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, + struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, + int rt, int eof, int delay, int convert, + xfs_fileoff_t *offp, xfs_extlen_t *lenp); +void xfs_bmap_adjacent(struct xfs_bmalloca *ap); +int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, + int whichfork, struct xfs_bmbt_irec *rec, + int *is_empty); + +/* preallocation and hole punch interface */ +int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len, int alloc_type); +int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t len); +int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); + +/* EOF block manipulation functions */ +bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); +int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, + bool need_iolock); + +int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, + struct xfs_swapext *sx); + +xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); + +#endif /* __XFS_BMAP_UTIL_H__ */ diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index ee2255bd656..cf893bc1e37 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -17,26 +17,23 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_buf_item.h" #include "xfs_btree.h" -#include "xfs_ialloc.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_alloc.h" /* * Cursor allocation zone. @@ -46,316 +43,242 @@ kmem_zone_t *xfs_btree_cur_zone; /* * Btree magic numbers. */ -const __uint32_t xfs_magics[XFS_BTNUM_MAX] = -{ - XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC +static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = { + { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC, + XFS_FIBT_MAGIC }, + { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, + XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC } }; +#define xfs_btree_magic(cur) \ + xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum] -/* - * Prototypes for internal routines. - */ -/* - * Checking routine: return maxrecs for the block. - */ -STATIC int /* number of records fitting in block */ -xfs_btree_maxrecs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block);/* generic btree block pointer */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_lblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree long form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer for block, if any */ +{ + int lblock_ok = 1; /* block passes checks */ + struct xfs_mount *mp; /* file system mount point */ -/* - * Internal routines. - */ + mp = cur->bc_mp; -/* - * Retrieve the block pointer from the cursor at the given level. - * This may be a bmap btree root or from a buffer. - */ -STATIC xfs_btree_block_t * /* generic btree block pointer */ -xfs_btree_get_block( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree */ - struct xfs_buf **bpp); /* buffer containing the block */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + lblock_ok = lblock_ok && + uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.l.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } -/* - * Checking routine: return maxrecs for the block. - */ -STATIC int /* number of records fitting in block */ -xfs_btree_maxrecs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block) /* generic btree block pointer */ -{ - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - return (int)XFS_ALLOC_BLOCK_MAXRECS( - be16_to_cpu(block->bb_h.bb_level), cur); - case XFS_BTNUM_BMAP: - return (int)XFS_BMAP_BLOCK_IMAXRECS( - be16_to_cpu(block->bb_h.bb_level), cur); - case XFS_BTNUM_INO: - return (int)XFS_INOBT_BLOCK_MAXRECS( - be16_to_cpu(block->bb_h.bb_level), cur); - default: - ASSERT(0); - return 0; + lblock_ok = lblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + block->bb_u.l.bb_leftsib && + (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_leftsib))) && + block->bb_u.l.bb_rightsib && + (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || + XFS_FSB_SANITY_CHECK(mp, + be64_to_cpu(block->bb_u.l.bb_rightsib))); + + if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_LBLOCK, + XFS_RANDOM_BTREE_CHECK_LBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); } + return 0; } -/* - * External routines. - */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sblock( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* btree short form block pointer */ + int level, /* level of the btree block */ + struct xfs_buf *bp) /* buffer containing block */ +{ + struct xfs_mount *mp; /* file system mount point */ + struct xfs_buf *agbp; /* buffer for ag. freespace struct */ + struct xfs_agf *agf; /* ag. freespace structure */ + xfs_agblock_t agflen; /* native ag. freespace length */ + int sblock_ok = 1; /* block passes checks */ + + mp = cur->bc_mp; + agbp = cur->bc_private.a.agbp; + agf = XFS_BUF_TO_AGF(agbp); + agflen = be32_to_cpu(agf->agf_length); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + sblock_ok = sblock_ok && + uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) && + block->bb_u.s.bb_blkno == cpu_to_be64( + bp ? bp->b_bn : XFS_BUF_DADDR_NULL); + } + + sblock_ok = sblock_ok && + be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) && + be16_to_cpu(block->bb_level) == level && + be16_to_cpu(block->bb_numrecs) <= + cur->bc_ops->get_maxrecs(cur, level) && + (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) && + block->bb_u.s.bb_leftsib && + (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || + be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) && + block->bb_u.s.bb_rightsib; + + if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp, + XFS_ERRTAG_BTREE_CHECK_SBLOCK, + XFS_RANDOM_BTREE_CHECK_SBLOCK))) { + if (bp) + trace_xfs_btree_corrupt(bp, _RET_IP_); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} -#ifdef DEBUG /* * Debug routine: check that block header is ok. */ -void +int xfs_btree_check_block( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block, /* generic btree block pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ - xfs_buf_t *bp) /* buffer containing block, if any */ + struct xfs_buf *bp) /* buffer containing block, if any */ { - if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) - xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level, - bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_check_lblock(cur, block, level, bp); else - xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level, - bp); -} - -/* - * Debug routine: check that keys are in the right order. - */ -void -xfs_btree_check_key( - xfs_btnum_t btnum, /* btree identifier */ - void *ak1, /* pointer to left (lower) key */ - void *ak2) /* pointer to right (higher) key */ -{ - switch (btnum) { - case XFS_BTNUM_BNO: { - xfs_alloc_key_t *k1; - xfs_alloc_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)); - break; - } - case XFS_BTNUM_CNT: { - xfs_alloc_key_t *k1; - xfs_alloc_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) || - (k1->ar_blockcount == k2->ar_blockcount && - be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock))); - break; - } - case XFS_BTNUM_BMAP: { - xfs_bmbt_key_t *k1; - xfs_bmbt_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(INT_GET(k1->br_startoff, ARCH_CONVERT) < INT_GET(k2->br_startoff, ARCH_CONVERT)); - break; - } - case XFS_BTNUM_INO: { - xfs_inobt_key_t *k1; - xfs_inobt_key_t *k2; - - k1 = ak1; - k2 = ak2; - ASSERT(INT_GET(k1->ir_startino, ARCH_CONVERT) < INT_GET(k2->ir_startino, ARCH_CONVERT)); - break; - } - default: - ASSERT(0); - } + return xfs_btree_check_sblock(cur, block, level, bp); } -#endif /* DEBUG */ /* - * Checking routine: check that long form block header is ok. + * Check that (long) pointer is ok. */ -/* ARGSUSED */ int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_lblock_t *block, /* btree long form block pointer */ - int level, /* level of the btree block */ - xfs_buf_t *bp) /* buffer for block, if any */ +xfs_btree_check_lptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_dfsbno_t bno, /* btree block disk address */ + int level) /* btree block level */ { - int lblock_ok; /* block passes checks */ - xfs_mount_t *mp; /* file system mount point */ - - mp = cur->bc_mp; - lblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && - be16_to_cpu(block->bb_level) == level && - be16_to_cpu(block->bb_numrecs) <= - xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && - block->bb_leftsib && - (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO || - XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) && - block->bb_rightsib && - (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO || - XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib))); - if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK, - XFS_RANDOM_BTREE_CHECK_LBLOCK))) { - if (bp) - xfs_buftrace("LBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW, - mp); - return XFS_ERROR(EFSCORRUPTED); - } + XFS_WANT_CORRUPTED_RETURN( + level > 0 && + bno != NULLDFSBNO && + XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); return 0; } +#ifdef DEBUG /* - * Checking routine: check that (long) pointer is ok. + * Check that (short) pointer is ok. */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lptr( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_dfsbno_t ptr, /* btree block disk address */ - int level) /* btree block level */ +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_sptr( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agblock_t bno, /* btree block disk address */ + int level) /* btree block level */ { - xfs_mount_t *mp; /* file system mount point */ + xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - mp = cur->bc_mp; XFS_WANT_CORRUPTED_RETURN( level > 0 && - ptr != NULLDFSBNO && - XFS_FSB_SANITY_CHECK(mp, ptr)); + bno != NULLAGBLOCK && + bno != 0 && + bno < agblocks); return 0; } -#ifdef DEBUG /* - * Debug routine: check that records are in the right order. + * Check that block ptr is ok. */ -void -xfs_btree_check_rec( - xfs_btnum_t btnum, /* btree identifier */ - void *ar1, /* pointer to left (lower) record */ - void *ar2) /* pointer to right (higher) record */ -{ - switch (btnum) { - case XFS_BTNUM_BNO: { - xfs_alloc_rec_t *r1; - xfs_alloc_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(be32_to_cpu(r1->ar_startblock) + - be32_to_cpu(r1->ar_blockcount) <= - be32_to_cpu(r2->ar_startblock)); - break; - } - case XFS_BTNUM_CNT: { - xfs_alloc_rec_t *r1; - xfs_alloc_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) || - (r1->ar_blockcount == r2->ar_blockcount && - be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock))); - break; - } - case XFS_BTNUM_BMAP: { - xfs_bmbt_rec_t *r1; - xfs_bmbt_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(xfs_bmbt_disk_get_startoff(r1) + - xfs_bmbt_disk_get_blockcount(r1) <= - xfs_bmbt_disk_get_startoff(r2)); - break; - } - case XFS_BTNUM_INO: { - xfs_inobt_rec_t *r1; - xfs_inobt_rec_t *r2; - - r1 = ar1; - r2 = ar2; - ASSERT(INT_GET(r1->ir_startino, ARCH_CONVERT) + XFS_INODES_PER_CHUNK <= - INT_GET(r2->ir_startino, ARCH_CONVERT)); - break; - } - default: - ASSERT(0); +STATIC int /* error (0 or EFSCORRUPTED) */ +xfs_btree_check_ptr( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_ptr *ptr, /* btree block disk address */ + int index, /* offset from ptr to check */ + int level) /* btree block level */ +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + return xfs_btree_check_lptr(cur, + be64_to_cpu((&ptr->l)[index]), level); + } else { + return xfs_btree_check_sptr(cur, + be32_to_cpu((&ptr->s)[index]), level); } } -#endif /* DEBUG */ +#endif /* - * Checking routine: check that block header is ok. + * Calculate CRC on the whole btree block and stuff it into the + * long-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. */ -/* ARGSUSED */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_sblock_t *block, /* btree short form block pointer */ - int level, /* level of the btree block */ - xfs_buf_t *bp) /* buffer containing block */ +void +xfs_btree_lblock_calc_crc( + struct xfs_buf *bp) { - xfs_buf_t *agbp; /* buffer for ag. freespace struct */ - xfs_agf_t *agf; /* ag. freespace structure */ - xfs_agblock_t agflen; /* native ag. freespace length */ - int sblock_ok; /* block passes checks */ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; - agbp = cur->bc_private.a.agbp; - agf = XFS_BUF_TO_AGF(agbp); - agflen = be32_to_cpu(agf->agf_length); - sblock_ok = - be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] && - be16_to_cpu(block->bb_level) == level && - be16_to_cpu(block->bb_numrecs) <= - xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) && - (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK || - be32_to_cpu(block->bb_leftsib) < agflen) && - block->bb_leftsib && - (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK || - be32_to_cpu(block->bb_rightsib) < agflen) && - block->bb_rightsib; - if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp, - XFS_ERRTAG_BTREE_CHECK_SBLOCK, - XFS_RANDOM_BTREE_CHECK_SBLOCK))) { - if (bp) - xfs_buftrace("SBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, - cur->bc_mp); - return XFS_ERROR(EFSCORRUPTED); - } - return 0; + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); +} + +bool +xfs_btree_lblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + + return true; } /* - * Checking routine: check that (short) pointer is ok. + * Calculate CRC on the whole btree block and stuff it into the + * short-form btree header. + * + * Prior to calculting the CRC, pull the LSN out of the buffer log item and put + * it into the buffer so recovery knows what the last modifcation was that made + * it to disk. */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sptr( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t ptr, /* btree block disk address */ - int level) /* btree block level */ +void +xfs_btree_sblock_calc_crc( + struct xfs_buf *bp) { - xfs_buf_t *agbp; /* buffer for ag. freespace struct */ - xfs_agf_t *agf; /* ag. freespace structure */ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf_log_item *bip = bp->b_fspriv; - agbp = cur->bc_private.a.agbp; - agf = XFS_BUF_TO_AGF(agbp); - XFS_WANT_CORRUPTED_RETURN( - level > 0 && - ptr != NULLAGBLOCK && ptr != 0 && - ptr < be32_to_cpu(agf->agf_length)); - return 0; + if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return; + if (bip) + block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); +} + +bool +xfs_btree_sblock_verify_crc( + struct xfs_buf *bp) +{ + if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + + return true; } /* @@ -380,7 +303,7 @@ xfs_btree_del_cursor( */ for (i = 0; i < cur->bc_nlevels; i++) { if (cur->bc_bufs[i]) - xfs_btree_setbuf(cur, i, NULL); + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); else if (!error) break; } @@ -414,104 +337,210 @@ xfs_btree_dup_cursor( tp = cur->bc_tp; mp = cur->bc_mp; + /* * Allocate a new cursor like the old one. */ - new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp, - cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip, - cur->bc_private.b.whichfork); + new = cur->bc_ops->dup_cursor(cur); + /* * Copy the record currently in the cursor. */ new->bc_rec = cur->bc_rec; + /* * For each level current, re-get the buffer and copy the ptr value. */ for (i = 0; i < new->bc_nlevels; i++) { new->bc_ptrs[i] = cur->bc_ptrs[i]; new->bc_ra[i] = cur->bc_ra[i]; - if ((bp = cur->bc_bufs[i])) { - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { + bp = cur->bc_bufs[i]; + if (bp) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_BUF_ADDR(bp), mp->m_bsize, + 0, &bp, + cur->bc_ops->buf_ops); + if (error) { xfs_btree_del_cursor(new, error); *ncur = NULL; return error; } - new->bc_bufs[i] = bp; - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - } else - new->bc_bufs[i] = NULL; - } - /* - * For bmap btrees, copy the firstblock, flist, and flags values, - * since init cursor doesn't get them. - */ - if (new->bc_btnum == XFS_BTNUM_BMAP) { - new->bc_private.b.firstblock = cur->bc_private.b.firstblock; - new->bc_private.b.flist = cur->bc_private.b.flist; - new->bc_private.b.flags = cur->bc_private.b.flags; + } + new->bc_bufs[i] = bp; } *ncur = new; return 0; } /* - * Change the cursor to point to the first record at the given level. - * Other levels are unaffected. + * XFS btree block layout and addressing: + * + * There are two types of blocks in the btree: leaf and non-leaf blocks. + * + * The leaf record start with a header then followed by records containing + * the values. A non-leaf block also starts with the same header, and + * then first contains lookup keys followed by an equal number of pointers + * to the btree blocks at the previous level. + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * +--------+-------+-------+-------+-------+-------+-------+ + * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N | + * +--------+-------+-------+-------+-------+-------+-------+ + * + * The header is called struct xfs_btree_block for reasons better left unknown + * and comes in different versions for short (32bit) and long (64bit) block + * pointers. The record and key structures are defined by the btree instances + * and opaque to the btree core. The block pointers are simple disk endian + * integers, available in a short (32bit) and long (64bit) variant. + * + * The helpers below calculate the offset of a given record, key or pointer + * into a btree block (xfs_btree_*_offset) or return a pointer to the given + * record, key or pointer (xfs_btree_*_addr). Note that all addressing + * inside the btree block is done using indices starting at one, not zero! */ -int /* success=1, failure=0 */ -xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level) /* level to change */ + +/* + * Return size of the btree block header for this btree instance. + */ +static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur) { - xfs_btree_block_t *block; /* generic btree block pointer */ - xfs_buf_t *bp; /* buffer containing block */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_LBLOCK_CRC_LEN; + return XFS_BTREE_LBLOCK_LEN; + } + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) + return XFS_BTREE_SBLOCK_CRC_LEN; + return XFS_BTREE_SBLOCK_LEN; +} - /* - * Get the block pointer for this level. - */ - block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); - /* - * It's empty, there is no such record. - */ - if (!block->bb_h.bb_numrecs) - return 0; - /* - * Set the ptr value to 1, that's the first record/key. - */ - cur->bc_ptrs[level] = 1; - return 1; +/* + * Return size of btree block pointers for this btree instance. + */ +static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur) +{ + return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + sizeof(__be64) : sizeof(__be32); +} + +/* + * Calculate offset of the n-th record in a btree block. + */ +STATIC size_t +xfs_btree_rec_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->rec_len; +} + +/* + * Calculate offset of the n-th key in a btree block. + */ +STATIC size_t +xfs_btree_key_offset( + struct xfs_btree_cur *cur, + int n) +{ + return xfs_btree_block_len(cur) + + (n - 1) * cur->bc_ops->key_len; +} + +/* + * Calculate offset of the n-th block pointer in a btree block. + */ +STATIC size_t +xfs_btree_ptr_offset( + struct xfs_btree_cur *cur, + int n, + int level) +{ + return xfs_btree_block_len(cur) + + cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len + + (n - 1) * xfs_btree_ptr_len(cur); +} + +/* + * Return a pointer to the n-th record in the btree block. + */ +STATIC union xfs_btree_rec * +xfs_btree_rec_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_rec *) + ((char *)block + xfs_btree_rec_offset(cur, n)); +} + +/* + * Return a pointer to the n-th key in the btree block. + */ +STATIC union xfs_btree_key * +xfs_btree_key_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + return (union xfs_btree_key *) + ((char *)block + xfs_btree_key_offset(cur, n)); +} + +/* + * Return a pointer to the n-th block pointer in the btree block. + */ +STATIC union xfs_btree_ptr * +xfs_btree_ptr_addr( + struct xfs_btree_cur *cur, + int n, + struct xfs_btree_block *block) +{ + int level = xfs_btree_get_level(block); + + ASSERT(block->bb_level != 0); + + return (union xfs_btree_ptr *) + ((char *)block + xfs_btree_ptr_offset(cur, n, level)); +} + +/* + * Get the root block which is stored in the inode. + * + * For now this btree implementation assumes the btree root is always + * stored in the if_broot field of an inode fork. + */ +STATIC struct xfs_btree_block * +xfs_btree_get_iroot( + struct xfs_btree_cur *cur) +{ + struct xfs_ifork *ifp; + + ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); + return (struct xfs_btree_block *)ifp->if_broot; } /* * Retrieve the block pointer from the cursor at the given level. - * This may be a bmap btree root or from a buffer. + * This may be an inode btree root or from a buffer. */ -STATIC xfs_btree_block_t * /* generic btree block pointer */ +STATIC struct xfs_btree_block * /* generic btree block pointer */ xfs_btree_get_block( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int level, /* level in btree */ - xfs_buf_t **bpp) /* buffer containing the block */ -{ - xfs_btree_block_t *block; /* return value */ - xfs_buf_t *bp; /* return buffer */ - xfs_ifork_t *ifp; /* inode fork pointer */ - int whichfork; /* data or attr fork */ - - if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) { - whichfork = cur->bc_private.b.whichfork; - ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork); - block = (xfs_btree_block_t *)ifp->if_broot; - bp = NULL; - } else { - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_BLOCK(bp); + struct xfs_buf **bpp) /* buffer containing the block */ +{ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *bpp = NULL; + return xfs_btree_get_iroot(cur); } - ASSERT(block != NULL); - *bpp = bp; - return block; + + *bpp = cur->bc_bufs[level]; + return XFS_BUF_TO_BLOCK(*bpp); } /* @@ -525,15 +554,11 @@ xfs_btree_get_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* @@ -548,138 +573,72 @@ xfs_btree_get_bufs( xfs_agblock_t agbno, /* allocation group block number */ uint lock) /* lock flags for get_buf */ { - xfs_buf_t *bp; /* buffer pointer (return value) */ xfs_daddr_t d; /* real disk block address */ ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); - ASSERT(bp); - ASSERT(!XFS_BUF_GETERROR(bp)); - return bp; + return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock); } /* - * Allocate a new btree cursor. - * The cursor is either for allocation (A) or bmap (B) or inodes (I). + * Check for the cursor referring to the last block at the given level. */ -xfs_btree_cur_t * /* new btree cursor */ -xfs_btree_init_cursor( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *agbp, /* (A only) buffer for agf structure */ - /* (I only) buffer for agi structure */ - xfs_agnumber_t agno, /* (AI only) allocation group number */ - xfs_btnum_t btnum, /* btree identifier */ - xfs_inode_t *ip, /* (B only) inode owning the btree */ - int whichfork) /* (B only) data or attr fork */ +int /* 1=is last block, 0=not last block */ +xfs_btree_islastblock( + xfs_btree_cur_t *cur, /* btree cursor */ + int level) /* level to check */ { - xfs_agf_t *agf; /* (A) allocation group freespace */ - xfs_agi_t *agi; /* (I) allocation group inodespace */ - xfs_btree_cur_t *cur; /* return value */ - xfs_ifork_t *ifp; /* (I) inode fork pointer */ - int nlevels=0; /* number of levels in the btree */ + struct xfs_btree_block *block; /* generic btree block pointer */ + xfs_buf_t *bp; /* buffer containing block */ - ASSERT(xfs_btree_cur_zone != NULL); - /* - * Allocate a new cursor. - */ - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); - /* - * Deduce the number of btree levels from the arguments. - */ - switch (btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - agf = XFS_BUF_TO_AGF(agbp); - nlevels = be32_to_cpu(agf->agf_levels[btnum]); - break; - case XFS_BTNUM_BMAP: - ifp = XFS_IFORK_PTR(ip, whichfork); - nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - break; - case XFS_BTNUM_INO: - agi = XFS_BUF_TO_AGI(agbp); - nlevels = be32_to_cpu(agi->agi_level); - break; - default: - ASSERT(0); - } - /* - * Fill in the common fields. - */ - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_nlevels = nlevels; - cur->bc_btnum = btnum; - cur->bc_blocklog = mp->m_sb.sb_blocklog; - /* - * Fill in private fields. - */ - switch (btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - /* - * Allocation btree fields. - */ - cur->bc_private.a.agbp = agbp; - cur->bc_private.a.agno = agno; - break; - case XFS_BTNUM_BMAP: - /* - * Bmap btree fields. - */ - cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork); - cur->bc_private.b.ip = ip; - cur->bc_private.b.firstblock = NULLFSBLOCK; - cur->bc_private.b.flist = NULL; - cur->bc_private.b.allocated = 0; - cur->bc_private.b.flags = 0; - cur->bc_private.b.whichfork = whichfork; - break; - case XFS_BTNUM_INO: - /* - * Inode allocation btree fields. - */ - cur->bc_private.i.agbp = agbp; - cur->bc_private.i.agno = agno; - break; - default: - ASSERT(0); - } - return cur; + block = xfs_btree_get_block(cur, level, &bp); + xfs_btree_check_block(cur, block, level, bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO); + else + return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); } /* - * Check for the cursor referring to the last block at the given level. + * Change the cursor to point to the first record at the given level. + * Other levels are unaffected. */ -int /* 1=is last block, 0=not last block */ -xfs_btree_islastblock( +STATIC int /* success=1, failure=0 */ +xfs_btree_firstrec( xfs_btree_cur_t *cur, /* btree cursor */ - int level) /* level to check */ + int level) /* level to change */ { - xfs_btree_block_t *block; /* generic btree block pointer */ + struct xfs_btree_block *block; /* generic btree block pointer */ xfs_buf_t *bp; /* buffer containing block */ + /* + * Get the block pointer for this level. + */ block = xfs_btree_get_block(cur, level, &bp); xfs_btree_check_block(cur, block, level, bp); - if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) - return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO; - else - return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK; + /* + * It's empty, there is no such record. + */ + if (!block->bb_numrecs) + return 0; + /* + * Set the ptr value to 1, that's the first record/key. + */ + cur->bc_ptrs[level] = 1; + return 1; } /* * Change the cursor to point to the last record in the current block * at the given level. Other levels are unaffected. */ -int /* success=1, failure=0 */ +STATIC int /* success=1, failure=0 */ xfs_btree_lastrec( xfs_btree_cur_t *cur, /* btree cursor */ int level) /* level to change */ { - xfs_btree_block_t *block; /* generic btree block pointer */ + struct xfs_btree_block *block; /* generic btree block pointer */ xfs_buf_t *bp; /* buffer containing block */ /* @@ -690,12 +649,12 @@ xfs_btree_lastrec( /* * It's empty, there is no such record. */ - if (!block->bb_h.bb_numrecs) + if (!block->bb_numrecs) return 0; /* * Set the ptr value to numrecs, that's the last record/key. */ - cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs); + cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); return 1; } @@ -739,69 +698,28 @@ xfs_btree_offsets( * Get a buffer for the block, return it read in. * Long-form addressing. */ -int /* error */ +int xfs_btree_read_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_fsblock_t fsbno, /* file system block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for fsbno */ - int refval) /* ref count value for buffer */ + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_fsblock_t fsbno, /* file system block number */ + uint lock, /* lock flags for read_buf */ + struct xfs_buf **bpp, /* buffer for fsbno */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; /* return value */ + struct xfs_buf *bp; /* return value */ xfs_daddr_t d; /* real disk block address */ - int error; + int error; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, + mp->m_bsize, lock, &bp, ops); + if (error) return error; - } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - } - *bpp = bp; - return 0; -} - -/* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - xfs_buf_t **bpp, /* buffer for agno/agbno */ - int refval) /* ref count value for buffer */ -{ - xfs_buf_t *bp; /* return value */ - xfs_daddr_t d; /* real disk block address */ - int error; - - ASSERT(agno != NULLAGNUMBER); - ASSERT(agbno != NULLAGBLOCK); - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, lock, &bp))) { - return error; - } - ASSERT(!bp || !XFS_BUF_GETERROR(bp)); - if (bp != NULL) { - switch (refval) { - case XFS_ALLOC_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); - break; - case XFS_INO_BTREE_REF: - XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); - break; - } - } + if (bp) + xfs_buf_set_ref(bp, refval); *bpp = bp; return 0; } @@ -813,15 +731,16 @@ xfs_btree_read_bufs( /* ARGSUSED */ void xfs_btree_reada_bufl( - xfs_mount_t *mp, /* file system mount point */ - xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_fsblock_t fsbno, /* file system block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(fsbno != NULLFSBLOCK); d = XFS_FSB_TO_DADDR(mp, fsbno); - xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); } /* @@ -831,111 +750,3320 @@ xfs_btree_reada_bufl( /* ARGSUSED */ void xfs_btree_reada_bufs( - xfs_mount_t *mp, /* file system mount point */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count) /* count of filesystem blocks */ + struct xfs_mount *mp, /* file system mount point */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_agblock_t agbno, /* allocation group block number */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops) { xfs_daddr_t d; ASSERT(agno != NULLAGNUMBER); ASSERT(agbno != NULLAGBLOCK); d = XFS_AGB_TO_DADDR(mp, agno, agbno); - xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count); + xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); +} + +STATIC int +xfs_btree_readahead_lblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); + xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, left, 1, + cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { + xfs_btree_reada_bufl(cur->bc_mp, right, 1, + cur->bc_ops->buf_ops); + rval++; + } + + return rval; +} + +STATIC int +xfs_btree_readahead_sblock( + struct xfs_btree_cur *cur, + int lr, + struct xfs_btree_block *block) +{ + int rval = 0; + xfs_agblock_t left = be32_to_cpu(block->bb_u.s.bb_leftsib); + xfs_agblock_t right = be32_to_cpu(block->bb_u.s.bb_rightsib); + + + if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + left, 1, cur->bc_ops->buf_ops); + rval++; + } + + if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { + xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, + right, 1, cur->bc_ops->buf_ops); + rval++; + } + + return rval; } /* * Read-ahead btree blocks, at the given level. * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. */ -int -xfs_btree_readahead_core( - xfs_btree_cur_t *cur, /* btree cursor */ +STATIC int +xfs_btree_readahead( + struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ int lr) /* left/right bits */ { - xfs_alloc_block_t *a; - xfs_bmbt_block_t *b; - xfs_inobt_block_t *i; - int rval = 0; + struct xfs_btree_block *block; + + /* + * No readahead needed if we are at the root level and the + * btree root is stored in the inode. + */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (lev == cur->bc_nlevels - 1)) + return 0; + + if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + return 0; - ASSERT(cur->bc_bufs[lev] != NULL); cur->bc_ra[lev] |= lr; - switch (cur->bc_btnum) { - case XFS_BTNUM_BNO: - case XFS_BTNUM_CNT: - a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); - if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(a->bb_leftsib), 1); - rval++; - } - if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, - be32_to_cpu(a->bb_rightsib), 1); - rval++; - } - break; - case XFS_BTNUM_BMAP: - b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]); - if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1); - rval++; - } - if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) { - xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1); - rval++; - } - break; - case XFS_BTNUM_INO: - i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); - if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, - be32_to_cpu(i->bb_leftsib), 1); - rval++; - } - if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) { - xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno, - be32_to_cpu(i->bb_rightsib), 1); - rval++; - } - break; - default: - ASSERT(0); + block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return xfs_btree_readahead_lblock(cur, lr, block); + return xfs_btree_readahead_sblock(cur, lr, block); +} + +STATIC xfs_daddr_t +xfs_btree_ptr_to_daddr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); + + return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); + } else { + ASSERT(cur->bc_private.a.agno != NULLAGNUMBER); + ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK)); + + return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno, + be32_to_cpu(ptr->s)); } - return rval; +} + +/* + * Readahead @count btree blocks at the given @ptr location. + * + * We don't need to care about long or short form btrees here as we have a + * method of converting the ptr directly to a daddr available to us. + */ +STATIC void +xfs_btree_readahead_ptr( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + xfs_extlen_t count) +{ + xfs_buf_readahead(cur->bc_mp->m_ddev_targp, + xfs_btree_ptr_to_daddr(cur, ptr), + cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops); } /* * Set the buffer for level "lev" in the cursor to bp, releasing * any previous buffer. */ -void +STATIC void xfs_btree_setbuf( xfs_btree_cur_t *cur, /* btree cursor */ int lev, /* level in btree */ xfs_buf_t *bp) /* new buffer to set */ { - xfs_btree_block_t *b; /* btree block */ - xfs_buf_t *obp; /* old buffer pointer */ + struct xfs_btree_block *b; /* btree block */ - obp = cur->bc_bufs[lev]; - if (obp) - xfs_trans_brelse(cur->bc_tp, obp); + if (cur->bc_bufs[lev]) + xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); cur->bc_bufs[lev] = bp; cur->bc_ra[lev] = 0; - if (!bp) - return; + b = XFS_BUF_TO_BLOCK(bp); - if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) { - if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)) cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; - if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO) + if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)) cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; } else { - if (be32_to_cpu(b->bb_u.s.bb_leftsib) == NULLAGBLOCK) + if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; - if (be32_to_cpu(b->bb_u.s.bb_rightsib) == NULLAGBLOCK) + if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; } } + +STATIC int +xfs_btree_ptr_is_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return ptr->l == cpu_to_be64(NULLDFSBNO); + else + return ptr->s == cpu_to_be32(NULLAGBLOCK); +} + +STATIC void +xfs_btree_set_ptr_null( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(NULLDFSBNO); + else + ptr->s = cpu_to_be32(NULLAGBLOCK); +} + +/* + * Get/set/init sibling pointers + */ +STATIC void +xfs_btree_get_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + ptr->l = block->bb_u.l.bb_rightsib; + else + ptr->l = block->bb_u.l.bb_leftsib; + } else { + if (lr == XFS_BB_RIGHTSIB) + ptr->s = block->bb_u.s.bb_rightsib; + else + ptr->s = block->bb_u.s.bb_leftsib; + } +} + +STATIC void +xfs_btree_set_sibling( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_ptr *ptr, + int lr) +{ + ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); + + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.l.bb_rightsib = ptr->l; + else + block->bb_u.l.bb_leftsib = ptr->l; + } else { + if (lr == XFS_BB_RIGHTSIB) + block->bb_u.s.bb_rightsib = ptr->s; + else + block->bb_u.s.bb_leftsib = ptr->s; + } +} + +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + buf->bb_magic = cpu_to_be32(magic); + buf->bb_level = cpu_to_be16(level); + buf->bb_numrecs = cpu_to_be16(numrecs); + + if (flags & XFS_BTREE_LONG_PTRS) { + buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); + buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.l.bb_owner = cpu_to_be64(owner); + uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.l.bb_pad = 0; + buf->bb_u.l.bb_lsn = 0; + } + } else { + /* owner is a 32 bit value on short blocks */ + __u32 __owner = (__u32)owner; + + buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); + buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); + if (flags & XFS_BTREE_CRC_BLOCKS) { + buf->bb_u.s.bb_blkno = cpu_to_be64(blkno); + buf->bb_u.s.bb_owner = cpu_to_be32(__owner); + uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid); + buf->bb_u.s.bb_lsn = 0; + } + } +} + +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags) +{ + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + magic, level, numrecs, owner, flags); +} + +STATIC void +xfs_btree_init_block_cur( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + int numrecs) +{ + __u64 owner; + + /* + * we can pull the owner from the cursor right now as the different + * owners align directly with the pointer size of the btree. This may + * change in future, but is safe for current users of the generic btree + * code. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + owner = cur->bc_private.b.ip->i_ino; + else + owner = cur->bc_private.a.agno; + + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_magic(cur), level, numrecs, + owner, cur->bc_flags); +} + +/* + * Return true if ptr is the last record in the btree and + * we need to track updates to this record. The decision + * will be further refined in the update_lastrec method. + */ +STATIC int +xfs_btree_is_lastrec( + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + int level) +{ + union xfs_btree_ptr ptr; + + if (level > 0) + return 0; + if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE)) + return 0; + + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &ptr)) + return 0; + return 1; +} + +STATIC void +xfs_btree_buf_to_ptr( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + union xfs_btree_ptr *ptr) +{ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, + XFS_BUF_ADDR(bp))); + else { + ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, + XFS_BUF_ADDR(bp))); + } +} + +STATIC void +xfs_btree_set_refs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) +{ + switch (cur->bc_btnum) { + case XFS_BTNUM_BNO: + case XFS_BTNUM_CNT: + xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF); + break; + case XFS_BTNUM_INO: + case XFS_BTNUM_FINO: + xfs_buf_set_ref(bp, XFS_INO_BTREE_REF); + break; + case XFS_BTNUM_BMAP: + xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF); + break; + default: + ASSERT(0); + } +} + +STATIC int +xfs_btree_get_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags); + + if (!*bpp) + return ENOMEM; + + (*bpp)->b_ops = cur->bc_ops->buf_ops; + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Read in the buffer at the given ptr and return the buffer and + * the block pointer within the buffer. + */ +STATIC int +xfs_btree_read_buf_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = cur->bc_mp; + xfs_daddr_t d; + int error; + + /* need to sort out how callers deal with failures first */ + ASSERT(!(flags & XBF_TRYLOCK)); + + d = xfs_btree_ptr_to_daddr(cur, ptr); + error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, + mp->m_bsize, flags, bpp, + cur->bc_ops->buf_ops); + if (error) + return error; + + xfs_btree_set_refs(cur, *bpp); + *block = XFS_BUF_TO_BLOCK(*bpp); + return 0; +} + +/* + * Copy keys from one btree block to another. + */ +STATIC void +xfs_btree_copy_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + union xfs_btree_key *src_key, + int numkeys) +{ + ASSERT(numkeys >= 0); + memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); +} + +/* + * Copy records from one btree block to another. + */ +STATIC void +xfs_btree_copy_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *dst_rec, + union xfs_btree_rec *src_rec, + int numrecs) +{ + ASSERT(numrecs >= 0); + memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Copy block pointers from one btree block to another. + */ +STATIC void +xfs_btree_copy_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *dst_ptr, + union xfs_btree_ptr *src_ptr, + int numptrs) +{ + ASSERT(numptrs >= 0); + memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Shift keys one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *key, + int dir, + int numkeys) +{ + char *dst_key; + + ASSERT(numkeys >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_key = (char *)key + (dir * cur->bc_ops->key_len); + memmove(dst_key, key, numkeys * cur->bc_ops->key_len); +} + +/* + * Shift records one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_recs( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec, + int dir, + int numrecs) +{ + char *dst_rec; + + ASSERT(numrecs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len); + memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len); +} + +/* + * Shift block pointers one index left/right inside a single btree block. + */ +STATIC void +xfs_btree_shift_ptrs( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + int dir, + int numptrs) +{ + char *dst_ptr; + + ASSERT(numptrs >= 0); + ASSERT(dir == 1 || dir == -1); + + dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur)); + memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur)); +} + +/* + * Log key values from the btree block. + */ +STATIC void +xfs_btree_log_keys( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_key_offset(cur, first), + xfs_btree_key_offset(cur, last + 1) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log record values from the btree block. + */ +void +xfs_btree_log_recs( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int first, + int last) +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_rec_offset(cur, first), + xfs_btree_rec_offset(cur, last + 1) - 1); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log block pointer fields from a btree block (nonleaf). + */ +STATIC void +xfs_btree_log_ptrs( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int first, /* index of first pointer to log */ + int last) /* index of last pointer to log */ +{ + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBII(cur, bp, first, last); + + if (bp) { + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + int level = xfs_btree_get_level(block); + + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, + xfs_btree_ptr_offset(cur, first, level), + xfs_btree_ptr_offset(cur, last + 1, level) - 1); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Log fields from a btree block header. + */ +void +xfs_btree_log_block( + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_buf *bp, /* buffer containing btree block */ + int fields) /* mask of fields: XFS_BB_... */ +{ + int first; /* first byte offset logged */ + int last; /* last byte offset logged */ + static const short soffsets[] = { /* table of offsets (short) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib), + offsetof(struct xfs_btree_block, bb_u.s.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.s.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.s.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.s.bb_owner), + offsetof(struct xfs_btree_block, bb_u.s.bb_crc), + XFS_BTREE_SBLOCK_CRC_LEN + }; + static const short loffsets[] = { /* table of offsets (long) */ + offsetof(struct xfs_btree_block, bb_magic), + offsetof(struct xfs_btree_block, bb_level), + offsetof(struct xfs_btree_block, bb_numrecs), + offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib), + offsetof(struct xfs_btree_block, bb_u.l.bb_blkno), + offsetof(struct xfs_btree_block, bb_u.l.bb_lsn), + offsetof(struct xfs_btree_block, bb_u.l.bb_uuid), + offsetof(struct xfs_btree_block, bb_u.l.bb_owner), + offsetof(struct xfs_btree_block, bb_u.l.bb_crc), + offsetof(struct xfs_btree_block, bb_u.l.bb_pad), + XFS_BTREE_LBLOCK_CRC_LEN + }; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGBI(cur, bp, fields); + + if (bp) { + int nbits; + + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + /* + * We don't log the CRC when updating a btree + * block but instead recreate it during log + * recovery. As the log buffers have checksums + * of their own this is safe and avoids logging a crc + * update in a lot of places. + */ + if (fields == XFS_BB_ALL_BITS) + fields = XFS_BB_ALL_BITS_CRC; + nbits = XFS_BB_NUM_BITS_CRC; + } else { + nbits = XFS_BB_NUM_BITS; + } + xfs_btree_offsets(fields, + (cur->bc_flags & XFS_BTREE_LONG_PTRS) ? + loffsets : soffsets, + nbits, &first, &last); + xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(cur->bc_tp, bp, first, last); + } else { + xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, + xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); +} + +/* + * Increment cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_increment( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + union xfs_btree_ptr ptr; + struct xfs_buf *bp; + int error; /* error return value */ + int lev; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the right at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* We're done if we remain in the block after the increment. */ + if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + goto out1; + + /* Fail if we just went off the right edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, increment); + + /* + * March up the tree incrementing pointers. + * Stop when we don't go off the right edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + block = xfs_btree_get_block(cur, lev, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, lev, bp); + if (error) + goto error0; +#endif + + if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + break; + + /* Read-ahead the right block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); + } + + /* + * If we went off the root then we are either seriously + * confused or have the tree root in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); + if (error) + goto error0; + + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = 1; + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Decrement cursor by one record at the level. + * For nonzero levels the leaf-ward information is untouched. + */ +int /* error */ +xfs_btree_decrement( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; + xfs_buf_t *bp; + int error; /* error return value */ + int lev; + union xfs_btree_ptr ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + ASSERT(level < cur->bc_nlevels); + + /* Read-ahead to the left at this level. */ + xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); + + /* We're done if we remain in the block after the decrement. */ + if (--cur->bc_ptrs[level] > 0) + goto out1; + + /* Get a pointer to the btree block. */ + block = xfs_btree_get_block(cur, level, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we just went off the left edge of the tree. */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &ptr)) + goto out0; + + XFS_BTREE_STATS_INC(cur, decrement); + + /* + * March up the tree decrementing pointers. + * Stop when we don't go off the left edge of a block. + */ + for (lev = level + 1; lev < cur->bc_nlevels; lev++) { + if (--cur->bc_ptrs[lev] > 0) + break; + /* Read-ahead the left block for the next loop. */ + xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + } + + /* + * If we went off the root then we are seriously confused. + * or the root of the tree is in an inode. + */ + if (lev == cur->bc_nlevels) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + goto out0; + ASSERT(0); + error = EFSCORRUPTED; + goto error0; + } + ASSERT(lev < cur->bc_nlevels); + + /* + * Now walk back down the tree, fixing up the cursor's buffer + * pointers and key numbers. + */ + for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { + union xfs_btree_ptr *ptrp; + + ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + --lev; + error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); + if (error) + goto error0; + xfs_btree_setbuf(cur, lev, bp); + cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + } +out1: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +STATIC int +xfs_btree_lookup_get_block( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ +{ + struct xfs_buf *bp; /* buffer pointer for btree block */ + int error = 0; + + /* special case the root block if in an inode */ + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) { + *blkp = xfs_btree_get_iroot(cur); + return 0; + } + + /* + * If the old buffer at this level for the disk address we are + * looking for re-use it. + * + * Otherwise throw it away and get a new one. + */ + bp = cur->bc_bufs[level]; + if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) { + *blkp = XFS_BUF_TO_BLOCK(bp); + return 0; + } + + error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp); + if (error) + return error; + + xfs_btree_setbuf(cur, level, bp); + return 0; +} + +/* + * Get current search key. For level 0 we don't actually have a key + * structure so we make one up from the record. For all other levels + * we just return the right key. + */ +STATIC union xfs_btree_key * +xfs_lookup_get_search_key( + struct xfs_btree_cur *cur, + int level, + int keyno, + struct xfs_btree_block *block, + union xfs_btree_key *kp) +{ + if (level == 0) { + cur->bc_ops->init_key_from_rec(kp, + xfs_btree_rec_addr(cur, keyno, block)); + return kp; + } + + return xfs_btree_key_addr(cur, keyno, block); +} + +/* + * Lookup the record. The cursor is made to point to it, based on dir. + * stat is set to 0 if can't find any such record, 1 for success. + */ +int /* error */ +xfs_btree_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_lookup_t dir, /* <=, ==, or >= */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* current btree block */ + __int64_t diff; /* difference for the current key */ + int error; /* error return value */ + int keyno; /* current key number */ + int level; /* level in the btree */ + union xfs_btree_ptr *pp; /* ptr to btree block */ + union xfs_btree_ptr ptr; /* ptr to btree block */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, dir); + + XFS_BTREE_STATS_INC(cur, lookup); + + block = NULL; + keyno = 0; + + /* initialise start pointer from cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &ptr); + pp = &ptr; + + /* + * Iterate over each level in the btree, starting at the root. + * For each level above the leaves, find the key we need, based + * on the lookup record, then follow the corresponding block + * pointer down to the next level. + */ + for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { + /* Get the block we need to do the lookup on. */ + error = xfs_btree_lookup_get_block(cur, level, pp, &block); + if (error) + goto error0; + + if (diff == 0) { + /* + * If we already had a key match at a higher level, we + * know we need to use the first entry in this block. + */ + keyno = 1; + } else { + /* Otherwise search this block. Do a binary search. */ + + int high; /* high entry number */ + int low; /* low entry number */ + + /* Set low and high entry numbers, 1-based. */ + low = 1; + high = xfs_btree_get_numrecs(block); + if (!high) { + /* Block is empty, must be an empty leaf. */ + ASSERT(level == 0 && cur->bc_nlevels == 1); + + cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Binary search the block. */ + while (low <= high) { + union xfs_btree_key key; + union xfs_btree_key *kp; + + XFS_BTREE_STATS_INC(cur, compare); + + /* keyno is average of low and high. */ + keyno = (low + high) >> 1; + + /* Get current search key */ + kp = xfs_lookup_get_search_key(cur, level, + keyno, block, &key); + + /* + * Compute difference to get next direction: + * - less than, move right + * - greater than, move left + * - equal, we're done + */ + diff = cur->bc_ops->key_diff(cur, kp); + if (diff < 0) + low = keyno + 1; + else if (diff > 0) + high = keyno - 1; + else + break; + } + } + + /* + * If there are more levels, set up for the next level + * by getting the block number and filling in the cursor. + */ + if (level > 0) { + /* + * If we moved left, need the previous key number, + * unless there isn't one. + */ + if (diff > 0 && --keyno < 1) + keyno = 1; + pp = xfs_btree_ptr_addr(cur, keyno, block); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, pp, 0, level); + if (error) + goto error0; +#endif + cur->bc_ptrs[level] = keyno; + } + } + + /* Done with the search. See if we need to adjust the results. */ + if (dir != XFS_LOOKUP_LE && diff < 0) { + keyno++; + /* + * If ge search and we went off the end of the block, but it's + * not the last block, we're in the wrong block. + */ + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + if (dir == XFS_LOOKUP_GE && + keyno > xfs_btree_get_numrecs(block) && + !xfs_btree_ptr_is_null(cur, &ptr)) { + int i; + + cur->bc_ptrs[0] = keyno; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } else if (dir == XFS_LOOKUP_LE && diff > 0) + keyno--; + cur->bc_ptrs[0] = keyno; + + /* Return if we succeeded or not. */ + if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) + *stat = 0; + else if (dir != XFS_LOOKUP_EQ || diff == 0) + *stat = 1; + else + *stat = 0; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Update keys at all levels from here to the root along the cursor's path. + */ +STATIC int +xfs_btree_updkey( + struct xfs_btree_cur *cur, + union xfs_btree_key *keyp, + int level) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_key *kp; + int ptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIK(cur, level, keyp); + + ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1); + + /* + * Go up the tree from this level toward the root. + * At each level, update the key value to the value input. + * Stop when we reach a level where the cursor isn't pointing + * at the first entry in the block. + */ + for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { +#ifdef DEBUG + int error; +#endif + block = xfs_btree_get_block(cur, level, &bp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } +#endif + ptr = cur->bc_ptrs[level]; + kp = xfs_btree_key_addr(cur, ptr, block); + xfs_btree_copy_keys(cur, kp, keyp, 1); + xfs_btree_log_keys(cur, bp, ptr, ptr); + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Update the record referred to by cur to the value in the + * given record. This either works (return 0) or gets an + * EFSCORRUPTED error. + */ +int +xfs_btree_update( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + int error; + int ptr; + union xfs_btree_rec *rp; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGR(cur, rec); + + /* Pick up the current block. */ + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + goto error0; +#endif + /* Get the address of the rec to be updated. */ + ptr = cur->bc_ptrs[0]; + rp = xfs_btree_rec_addr(cur, ptr, block); + + /* Fill in the new contents and log them. */ + xfs_btree_copy_recs(cur, rp, rec, 1); + xfs_btree_log_recs(cur, bp, ptr, ptr); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, 0)) { + cur->bc_ops->update_lastrec(cur, block, rec, + ptr, LASTREC_UPDATE); + } + + /* Updating first rec in leaf. Pass new key value up to our parent. */ + if (ptr == 1) { + union xfs_btree_key key; + + cur->bc_ops->init_key_from_rec(&key, rec); + error = xfs_btree_updkey(cur, &key, 1); + if (error) + goto error0; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record left from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_lshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs; /* left record count */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + int rrecs; /* right record count */ + union xfs_btree_ptr lptr; /* left btree pointer */ + union xfs_btree_key *rkp = NULL; /* right btree key */ + union xfs_btree_ptr *rpp = NULL; /* right address pointer */ + union xfs_btree_rec *rrp = NULL; /* right record pointer */ + int error; /* error return value */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) + goto out0; + + /* Set up variables for this block as "right". */ + right = xfs_btree_get_block(cur, level, &rbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, right, level, rbp); + if (error) + goto error0; +#endif + + /* If we've got no left sibling then we can't shift an entry left. */ + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + if (xfs_btree_ptr_is_null(cur, &lptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + if (cur->bc_ptrs[level] <= 1) + goto out0; + + /* Set up the left neighbor as "left". */ + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + lrecs = xfs_btree_get_numrecs(left); + if (lrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + rrecs = xfs_btree_get_numrecs(right); + + /* + * We add one entry to the left side and remove one for the right side. + * Account for it here, the changes will be updated on disk and logged + * later. + */ + lrecs++; + rrecs--; + + XFS_BTREE_STATS_INC(cur, lshift); + XFS_BTREE_STATS_ADD(cur, moves, 1); + + /* + * If non-leaf, copy a key and a ptr to the left block. + * Log the changes to the left block. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, rpp, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_keys(cur, lkp, rkp, 1); + xfs_btree_copy_ptrs(cur, lpp, rpp, 1); + + xfs_btree_log_keys(cur, lbp, lrecs, lrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->keys_inorder(cur, + xfs_btree_key_addr(cur, lrecs - 1, left), lkp)); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, 1); + xfs_btree_log_recs(cur, lbp, lrecs, lrecs); + + ASSERT(cur->bc_ops->recs_inorder(cur, + xfs_btree_rec_addr(cur, lrecs - 1, left), lrp)); + } + + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Slide the contents of right down one entry. + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1); + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ +#ifdef DEBUG + int i; /* loop index */ + + for (i = 0; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i + 1, level); + if (error) + goto error0; + } +#endif + xfs_btree_shift_keys(cur, + xfs_btree_key_addr(cur, 2, right), + -1, rrecs); + xfs_btree_shift_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, right), + -1, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + } else { + /* It's a leaf. operate on records */ + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, 2, right), + -1, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, right)); + rkp = &key; + } + + /* Update the parent key values of right. */ + error = xfs_btree_updkey(cur, rkp, level + 1); + if (error) + goto error0; + + /* Slide the cursor value left one. */ + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Move 1 record right from cur/level if possible. + * Update cur to reflect the new path. + */ +STATIC int /* error */ +xfs_btree_rshift( + struct xfs_btree_cur *cur, + int level, + int *stat) /* success/failure */ +{ + union xfs_btree_key key; /* btree key */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + union xfs_btree_ptr rptr; /* right block pointer */ + union xfs_btree_key *rkp; /* right btree key */ + int rrecs; /* right record count */ + int lrecs; /* left record count */ + int error; /* error return value */ + int i; /* loop counter */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level == cur->bc_nlevels - 1)) + goto out0; + + /* Set up variables for this block as "left". */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + /* If we've got no right sibling then we can't shift an entry right. */ + xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + goto out0; + + /* + * If the cursor entry is the one that would be moved, don't + * do it... it's too complicated. + */ + lrecs = xfs_btree_get_numrecs(left); + if (cur->bc_ptrs[level] >= lrecs) + goto out0; + + /* Set up the right neighbor as "right". */ + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* If it's full, it can't take another entry. */ + rrecs = xfs_btree_get_numrecs(right); + if (rrecs == cur->bc_ops->get_maxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, rshift); + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Make a hole at the start of the right neighbor block, then + * copy the last left block entry to the hole. + */ + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + union xfs_btree_ptr *rpp; + + lkp = xfs_btree_key_addr(cur, lrecs, left); + lpp = xfs_btree_ptr_addr(cur, lrecs, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = rrecs - 1; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_shift_keys(cur, rkp, 1, rrecs); + xfs_btree_shift_ptrs(cur, rpp, 1, rrecs); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, lpp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, and log it. */ + xfs_btree_copy_keys(cur, rkp, lkp, 1); + xfs_btree_copy_ptrs(cur, rpp, lpp, 1); + + xfs_btree_log_keys(cur, rbp, 1, rrecs + 1); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1); + + ASSERT(cur->bc_ops->keys_inorder(cur, rkp, + xfs_btree_key_addr(cur, 2, right))); + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *lrp; + union xfs_btree_rec *rrp; + + lrp = xfs_btree_rec_addr(cur, lrecs, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_shift_recs(cur, rrp, 1, rrecs); + + /* Now put the new data in, and log it. */ + xfs_btree_copy_recs(cur, rrp, lrp, 1); + xfs_btree_log_recs(cur, rbp, 1, rrecs + 1); + + cur->bc_ops->init_key_from_rec(&key, rrp); + rkp = &key; + + ASSERT(cur->bc_ops->recs_inorder(cur, rrp, + xfs_btree_rec_addr(cur, 2, right))); + } + + /* + * Decrement and log left's numrecs, bump and log right's numrecs. + */ + xfs_btree_set_numrecs(left, --lrecs); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS); + + xfs_btree_set_numrecs(right, ++rrecs); + xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS); + + /* + * Using a temporary cursor, update the parent key values of the + * block on the right. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error1; + + error = xfs_btree_updkey(tcur, rkp, level + 1); + if (error) + goto error1; + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + +error1: + XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR); + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Split cur/level block in half. + * Return new block number and the key to its first + * record (to be inserted into parent). + */ +STATIC int /* error */ +__xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rrptr; /* right-right sibling ptr */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + int lrecs; + int rrecs; + int src_index; + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key); + + XFS_BTREE_STATS_INC(cur, split); + + /* Set up left block (current one). */ + left = xfs_btree_get_block(cur, level, &lbp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block as "right". */ + error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* Fill in the btree header for the new right block. */ + xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0); + + /* + * Split the entries between the old and the new block evenly. + * Make sure that if there's an odd number of entries now, that + * each new block will have the same number of entries. + */ + lrecs = xfs_btree_get_numrecs(left); + rrecs = lrecs / 2; + if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + rrecs++; + src_index = (lrecs - rrecs + 1); + + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + + /* + * Copy btree block entries from the left block over to the + * new block, the right. Update the right block and log the + * changes. + */ + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, src_index, left); + lpp = xfs_btree_ptr_addr(cur, src_index, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); + +#ifdef DEBUG + for (i = src_index; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + xfs_btree_copy_keys(cur, rkp, lkp, rrecs); + xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs); + + xfs_btree_log_keys(cur, rbp, 1, rrecs); + xfs_btree_log_ptrs(cur, rbp, 1, rrecs); + + /* Grab the keys to the entries moved to the right block */ + xfs_btree_copy_keys(cur, key, rkp, 1); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, src_index, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, rrp, lrp, rrecs); + xfs_btree_log_recs(cur, rbp, 1, rrecs); + + cur->bc_ops->init_key_from_rec(key, + xfs_btree_rec_addr(cur, 1, right)); + } + + + /* + * Find the left block number by looking in the buffer. + * Adjust numrecs, sibling pointers. + */ + xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB); + xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB); + + lrecs -= rrecs; + xfs_btree_set_numrecs(left, lrecs); + xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs); + + xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* + * If there's a block to the new block's right, make that block + * point back to right instead of to left. + */ + if (!xfs_btree_ptr_is_null(cur, &rrptr)) { + error = xfs_btree_read_buf_block(cur, &rrptr, + 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + /* + * If the cursor is really in the right block, move it there. + * If it's just pointing past the last entry in left, then we'll + * insert there, so don't change anything in that case. + */ + if (cur->bc_ptrs[level] > lrecs + 1) { + xfs_btree_setbuf(cur, level, rbp); + cur->bc_ptrs[level] -= lrecs; + } + /* + * If there are more levels, we'll need another cursor which refers + * the right block, no matter where this cursor was. + */ + if (level + 1 < cur->bc_nlevels) { + error = xfs_btree_dup_cursor(cur, curp); + if (error) + goto error0; + (*curp)->bc_ptrs[level + 1]++; + } + *ptrp = rptr; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +struct xfs_btree_split_args { + struct xfs_btree_cur *cur; + int level; + union xfs_btree_ptr *ptrp; + union xfs_btree_key *key; + struct xfs_btree_cur **curp; + int *stat; /* success/failure */ + int result; + bool kswapd; /* allocation in kswapd context */ + struct completion *done; + struct work_struct work; +}; + +/* + * Stack switching interfaces for allocation + */ +static void +xfs_btree_split_worker( + struct work_struct *work) +{ + struct xfs_btree_split_args *args = container_of(work, + struct xfs_btree_split_args, work); + unsigned long pflags; + unsigned long new_pflags = PF_FSTRANS; + + /* + * we are in a transaction context here, but may also be doing work + * in kswapd context, and hence we may need to inherit that state + * temporarily to ensure that we don't block waiting for memory reclaim + * in any way. + */ + if (args->kswapd) + new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; + + current_set_flags_nested(&pflags, new_pflags); + + args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, + args->key, args->curp, args->stat); + complete(args->done); + + current_restore_flags_nested(&pflags, new_pflags); +} + +/* + * BMBT split requests often come in with little stack to work on. Push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. + */ +STATIC int /* error */ +xfs_btree_split( + struct xfs_btree_cur *cur, + int level, + union xfs_btree_ptr *ptrp, + union xfs_btree_key *key, + struct xfs_btree_cur **curp, + int *stat) /* success/failure */ +{ + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + + if (cur->bc_btnum != XFS_BTNUM_BMAP) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; + args.level = level; + args.ptrp = ptrp; + args.key = key; + args.curp = curp; + args.stat = stat; + args.done = &done; + args.kswapd = current_is_kswapd(); + INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker); + queue_work(xfs_alloc_wq, &args.work); + wait_for_completion(&done); + destroy_work_on_stack(&args.work); + return args.result; +} + + +/* + * Copy the old inode root contents into a real block and make the + * broot point to it. + */ +int /* error */ +xfs_btree_new_iroot( + struct xfs_btree_cur *cur, /* btree cursor */ + int *logflags, /* logging flags for inode */ + int *stat) /* return status - 0 fail */ +{ + struct xfs_buf *cbp; /* buffer for cblock */ + struct xfs_btree_block *block; /* btree block */ + struct xfs_btree_block *cblock; /* child btree block */ + union xfs_btree_key *ckp; /* child key pointer */ + union xfs_btree_ptr *cpp; /* child ptr pointer */ + union xfs_btree_key *kp; /* pointer to btree key */ + union xfs_btree_ptr *pp; /* pointer to block addr */ + union xfs_btree_ptr nptr; /* new block addr */ + int level; /* btree level */ + int error; /* error return code */ +#ifdef DEBUG + int i; /* loop counter */ +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + + level = cur->bc_nlevels - 1; + + block = xfs_btree_get_iroot(cur); + pp = xfs_btree_ptr_addr(cur, 1, block); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat); + if (error) + goto error0; + if (*stat == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; + } + XFS_BTREE_STATS_INC(cur, alloc); + + /* Copy the root into a real block. */ + error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp); + if (error) + goto error0; + + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ + memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } + + be16_add_cpu(&block->bb_level, 1); + xfs_btree_set_numrecs(block, 1); + cur->bc_nlevels++; + cur->bc_ptrs[level + 1] = 1; + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); + + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock)); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, &nptr, 0, level); + if (error) + goto error0; +#endif + xfs_btree_copy_ptrs(cur, pp, &nptr, 1); + + xfs_iroot_realloc(cur->bc_private.b.ip, + 1 - xfs_btree_get_numrecs(cblock), + cur->bc_private.b.whichfork); + + xfs_btree_setbuf(cur, level, cbp); + + /* + * Do all this logging at the end so that + * the root is at the right level. + */ + xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS); + xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs)); + + *logflags |= + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork); + *stat = 1; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Allocate a new root block, fill it in. + */ +STATIC int /* error */ +xfs_btree_new_root( + struct xfs_btree_cur *cur, /* btree cursor */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* one half of the old root block */ + struct xfs_buf *bp; /* buffer containing block */ + int error; /* error return value */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + struct xfs_buf *nbp; /* new (root) buffer */ + struct xfs_btree_block *new; /* new (root) btree block */ + int nptr; /* new value for key index, 1 or 2 */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + union xfs_btree_ptr rptr; + union xfs_btree_ptr lptr; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, newroot); + + /* initialise our start point from the cursor */ + cur->bc_ops->init_ptr_from_cur(cur, &rptr); + + /* Allocate the new block. If we can't do it, we're toast. Give up. */ + error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat); + if (error) + goto error0; + if (*stat == 0) + goto out0; + XFS_BTREE_STATS_INC(cur, alloc); + + /* Set up the new block. */ + error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp); + if (error) + goto error0; + + /* Set the root in the holding structure increasing the level by 1. */ + cur->bc_ops->set_root(cur, &lptr, 1); + + /* + * At the previous root level there are now two blocks: the old root, + * and the new block generated when it was split. We don't know which + * one the cursor is pointing at, so we set up variables "left" and + * "right" for each case. + */ + block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp); + if (error) + goto error0; +#endif + + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* Our block is left, pick up the right block. */ + lbp = bp; + xfs_btree_buf_to_ptr(cur, lbp, &lptr); + left = block; + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + bp = rbp; + nptr = 1; + } else { + /* Our block is right, pick up the left block. */ + rbp = bp; + xfs_btree_buf_to_ptr(cur, rbp, &rptr); + right = block; + xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB); + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + bp = lbp; + nptr = 2; + } + /* Fill in the new block's btree header and log it. */ + xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2); + xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); + ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && + !xfs_btree_ptr_is_null(cur, &rptr)); + + /* Fill in the key data in the new root. */ + if (xfs_btree_get_level(left) > 0) { + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 1, new), + xfs_btree_key_addr(cur, 1, left), 1); + xfs_btree_copy_keys(cur, + xfs_btree_key_addr(cur, 2, new), + xfs_btree_key_addr(cur, 1, right), 1); + } else { + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 1, new), + xfs_btree_rec_addr(cur, 1, left)); + cur->bc_ops->init_key_from_rec( + xfs_btree_key_addr(cur, 2, new), + xfs_btree_rec_addr(cur, 1, right)); + } + xfs_btree_log_keys(cur, nbp, 1, 2); + + /* Fill in the pointer data in the new root. */ + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 1, new), &lptr, 1); + xfs_btree_copy_ptrs(cur, + xfs_btree_ptr_addr(cur, 2, new), &rptr, 1); + xfs_btree_log_ptrs(cur, nbp, 1, 2); + + /* Fix up the cursor. */ + xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); + cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_nlevels++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; +} + +STATIC int +xfs_btree_make_block_unfull( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* btree level */ + int numrecs,/* # of recs in block */ + int *oindex,/* old tree index */ + int *index, /* new tree index */ + union xfs_btree_ptr *nptr, /* new btree ptr */ + struct xfs_btree_cur **ncur, /* new btree cursor */ + union xfs_btree_rec *nrec, /* new record */ + int *stat) +{ + union xfs_btree_key key; /* new btree key value */ + int error = 0; + + if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + level == cur->bc_nlevels - 1) { + struct xfs_inode *ip = cur->bc_private.b.ip; + + if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) { + /* A root block that can be made bigger. */ + xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork); + } else { + /* A root block that needs replacing */ + int logflags = 0; + + error = xfs_btree_new_iroot(cur, &logflags, stat); + if (error || *stat == 0) + return error; + + xfs_trans_log_inode(cur->bc_tp, ip, logflags); + } + + return 0; + } + + /* First, try shifting an entry to the right neighbor. */ + error = xfs_btree_rshift(cur, level, stat); + if (error || *stat) + return error; + + /* Next, try shifting an entry to the left neighbor. */ + error = xfs_btree_lshift(cur, level, stat); + if (error) + return error; + + if (*stat) { + *oindex = *index = cur->bc_ptrs[level]; + return 0; + } + + /* + * Next, try splitting the current block in half. + * + * If this works we have to re-set our variables because we + * could be in a different block now. + */ + error = xfs_btree_split(cur, level, nptr, &key, ncur, stat); + if (error || *stat == 0) + return error; + + + *index = cur->bc_ptrs[level]; + cur->bc_ops->init_rec_from_key(&key, nrec); + return 0; +} + +/* + * Insert one record/level. Return information to the caller + * allowing the next level up to proceed if necessary. + */ +STATIC int +xfs_btree_insrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level to insert record at */ + union xfs_btree_ptr *ptrp, /* i/o: block number inserted */ + union xfs_btree_rec *recp, /* i/o: record data inserted */ + struct xfs_btree_cur **curp, /* output: new cursor replacing cur */ + int *stat) /* success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer for block */ + union xfs_btree_key key; /* btree key */ + union xfs_btree_ptr nptr; /* new block ptr */ + struct xfs_btree_cur *ncur; /* new btree cursor */ + union xfs_btree_rec nrec; /* new record count */ + int optr; /* old key/record index */ + int ptr; /* key/record index */ + int numrecs;/* number of records */ + int error; /* error return value */ +#ifdef DEBUG + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp); + + ncur = NULL; + + /* + * If we have an external root pointer, and we've made it to the + * root level, allocate a new root block and we're done. + */ + if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) && + (level >= cur->bc_nlevels)) { + error = xfs_btree_new_root(cur, stat); + xfs_btree_set_ptr_null(cur, ptrp); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return error; + } + + /* If we're off the left edge, return failure. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Make a key out of the record data to be inserted, and save it. */ + cur->bc_ops->init_key_from_rec(&key, recp); + + optr = ptr; + + XFS_BTREE_STATS_INC(cur, insrec); + + /* Get pointers to the btree buffer and block. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; + + /* Check that the new entry is being inserted in the right place. */ + if (ptr <= numrecs) { + if (level == 0) { + ASSERT(cur->bc_ops->recs_inorder(cur, recp, + xfs_btree_rec_addr(cur, ptr, block))); + } else { + ASSERT(cur->bc_ops->keys_inorder(cur, &key, + xfs_btree_key_addr(cur, ptr, block))); + } + } +#endif + + /* + * If the block is full, we can't insert the new entry until we + * make the block un-full. + */ + xfs_btree_set_ptr_null(cur, &nptr); + if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) { + error = xfs_btree_make_block_unfull(cur, level, numrecs, + &optr, &ptr, &nptr, &ncur, &nrec, stat); + if (error || *stat == 0) + goto error0; + } + + /* + * The current block may have changed if the block was + * previously full and we have just made space in it. + */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + return error; +#endif + + /* + * At this point we know there's room for our new entry in the block + * we're pointing at. + */ + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1); + + if (level > 0) { + /* It's a nonleaf. make a hole in the keys and ptrs */ + union xfs_btree_key *kp; + union xfs_btree_ptr *pp; + + kp = xfs_btree_key_addr(cur, ptr, block); + pp = xfs_btree_ptr_addr(cur, ptr, block); + +#ifdef DEBUG + for (i = numrecs - ptr; i >= 0; i--) { + error = xfs_btree_check_ptr(cur, pp, i, level); + if (error) + return error; + } +#endif + + xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); + xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1); + +#ifdef DEBUG + error = xfs_btree_check_ptr(cur, ptrp, 0, level); + if (error) + goto error0; +#endif + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_keys(cur, kp, &key, 1); + xfs_btree_copy_ptrs(cur, pp, ptrp, 1); + numrecs++; + xfs_btree_set_numrecs(block, numrecs); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs); + xfs_btree_log_keys(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->keys_inorder(cur, kp, + xfs_btree_key_addr(cur, ptr + 1, block))); + } +#endif + } else { + /* It's a leaf. make a hole in the records */ + union xfs_btree_rec *rp; + + rp = xfs_btree_rec_addr(cur, ptr, block); + + xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1); + + /* Now put the new data in, bump numrecs and log it. */ + xfs_btree_copy_recs(cur, rp, recp, 1); + xfs_btree_set_numrecs(block, ++numrecs); + xfs_btree_log_recs(cur, bp, ptr, numrecs); +#ifdef DEBUG + if (ptr < numrecs) { + ASSERT(cur->bc_ops->recs_inorder(cur, rp, + xfs_btree_rec_addr(cur, ptr + 1, block))); + } +#endif + } + + /* Log the new number of records in the btree header. */ + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* If we inserted at the start of a block, update the parents' keys. */ + if (optr == 1) { + error = xfs_btree_updkey(cur, &key, level + 1); + if (error) + goto error0; + } + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, recp, + ptr, LASTREC_INSREC); + } + + /* + * Return the new block number, if any. + * If there is one, give back a record value and a cursor too. + */ + *ptrp = nptr; + if (!xfs_btree_ptr_is_null(cur, &nptr)) { + *recp = nrec; + *curp = ncur; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Insert the record at the point referenced by cur. + * + * A multi-level split of the tree on insert will invalidate the original + * cursor. All callers of this function should assume that the cursor is + * no longer valid and revalidate it. + */ +int +xfs_btree_insert( + struct xfs_btree_cur *cur, + int *stat) +{ + int error; /* error return value */ + int i; /* result value, 0 for failure */ + int level; /* current level number in btree */ + union xfs_btree_ptr nptr; /* new block number (split result) */ + struct xfs_btree_cur *ncur; /* new cursor (split result) */ + struct xfs_btree_cur *pcur; /* previous level's cursor */ + union xfs_btree_rec rec; /* record to insert */ + + level = 0; + ncur = NULL; + pcur = cur; + + xfs_btree_set_ptr_null(cur, &nptr); + cur->bc_ops->init_rec_from_cur(cur, &rec); + + /* + * Loop going up the tree, starting at the leaf level. + * Stop when we don't get a split block, that must mean that + * the insert is finished with this level. + */ + do { + /* + * Insert nrec/nptr into this level of the tree. + * Note if we fail, nptr will be null. + */ + error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i); + if (error) { + if (pcur != cur) + xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); + goto error0; + } + + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + level++; + + /* + * See if the cursor we just used is trash. + * Can't trash the caller's cursor, but otherwise we should + * if ncur is a new cursor or we're about to be done. + */ + if (pcur != cur && + (ncur || xfs_btree_ptr_is_null(cur, &nptr))) { + /* Save the state from the cursor before we trash it */ + if (cur->bc_ops->update_cursor) + cur->bc_ops->update_cursor(pcur, cur); + cur->bc_nlevels = pcur->bc_nlevels; + xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); + } + /* If we got a new cursor, switch to it. */ + if (ncur) { + pcur = ncur; + ncur = NULL; + } + } while (!xfs_btree_ptr_is_null(cur, &nptr)); + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Try to merge a non-leaf block back into the inode root. + * + * Note: the killroot names comes from the fact that we're effectively + * killing the old root block. But because we can't just delete the + * inode we have to copy the single block it was pointing to into the + * inode. + */ +STATIC int +xfs_btree_kill_iroot( + struct xfs_btree_cur *cur) +{ + int whichfork = cur->bc_private.b.whichfork; + struct xfs_inode *ip = cur->bc_private.b.ip; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_btree_block *block; + struct xfs_btree_block *cblock; + union xfs_btree_key *kp; + union xfs_btree_key *ckp; + union xfs_btree_ptr *pp; + union xfs_btree_ptr *cpp; + struct xfs_buf *cbp; + int level; + int index; + int numrecs; +#ifdef DEBUG + union xfs_btree_ptr ptr; + int i; +#endif + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(cur->bc_nlevels > 1); + + /* + * Don't deal with the root block needs to be a leaf case. + * We're just going to turn the thing back into extents anyway. + */ + level = cur->bc_nlevels - 1; + if (level == 1) + goto out0; + + /* + * Give up if the root has multiple children. + */ + block = xfs_btree_get_iroot(cur); + if (xfs_btree_get_numrecs(block) != 1) + goto out0; + + cblock = xfs_btree_get_block(cur, level - 1, &cbp); + numrecs = xfs_btree_get_numrecs(cblock); + + /* + * Only do this if the next level will fit. + * Then the data must be copied up to the inode, + * instead of freeing the root you free the next level. + */ + if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level)) + goto out0; + + XFS_BTREE_STATS_INC(cur, killroot); + +#ifdef DEBUG + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); + xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB); + ASSERT(xfs_btree_ptr_is_null(cur, &ptr)); +#endif + + index = numrecs - cur->bc_ops->get_maxrecs(cur, level); + if (index) { + xfs_iroot_realloc(cur->bc_private.b.ip, index, + cur->bc_private.b.whichfork); + block = ifp->if_broot; + } + + be16_add_cpu(&block->bb_numrecs, index); + ASSERT(block->bb_numrecs == cblock->bb_numrecs); + + kp = xfs_btree_key_addr(cur, 1, block); + ckp = xfs_btree_key_addr(cur, 1, cblock); + xfs_btree_copy_keys(cur, kp, ckp, numrecs); + + pp = xfs_btree_ptr_addr(cur, 1, block); + cpp = xfs_btree_ptr_addr(cur, 1, cblock); +#ifdef DEBUG + for (i = 0; i < numrecs; i++) { + int error; + + error = xfs_btree_check_ptr(cur, cpp, i, level - 1); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + } +#endif + xfs_btree_copy_ptrs(cur, pp, cpp, numrecs); + + cur->bc_ops->free_block(cur, cbp); + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level - 1] = NULL; + be16_add_cpu(&block->bb_level, -1); + xfs_trans_log_inode(cur->bc_tp, ip, + XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork)); + cur->bc_nlevels--; +out0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +/* + * Kill the current root node, and replace it with it's only child node. + */ +STATIC int +xfs_btree_kill_root( + struct xfs_btree_cur *cur, + struct xfs_buf *bp, + int level, + union xfs_btree_ptr *newroot) +{ + int error; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_STATS_INC(cur, killroot); + + /* + * Update the root pointer, decreasing the level by 1 and then + * free the old root. + */ + cur->bc_ops->set_root(cur, newroot, -1); + + error = cur->bc_ops->free_block(cur, bp); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; + } + + XFS_BTREE_STATS_INC(cur, free); + + cur->bc_bufs[level] = NULL; + cur->bc_ra[level] = 0; + cur->bc_nlevels--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + return 0; +} + +STATIC int +xfs_btree_dec_cursor( + struct xfs_btree_cur *cur, + int level, + int *stat) +{ + int error; + int i; + + if (level > 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + return error; + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; +} + +/* + * Single level of the btree record deletion routine. + * Delete record pointed to by cur/level. + * Remove the record from its block then rebalance the tree. + * Return 0 for error, 1 for done, 2 to go on to the next level. + */ +STATIC int /* error */ +xfs_btree_delrec( + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level removing record from */ + int *stat) /* fail/done/go-on */ +{ + struct xfs_btree_block *block; /* btree block */ + union xfs_btree_ptr cptr; /* current block ptr */ + struct xfs_buf *bp; /* buffer for block */ + int error; /* error return value */ + int i; /* loop counter */ + union xfs_btree_key key; /* storage for keyp */ + union xfs_btree_key *keyp = &key; /* passed to the next level */ + union xfs_btree_ptr lptr; /* left sibling block ptr */ + struct xfs_buf *lbp; /* left buffer pointer */ + struct xfs_btree_block *left; /* left btree block */ + int lrecs = 0; /* left record count */ + int ptr; /* key/record index */ + union xfs_btree_ptr rptr; /* right sibling block ptr */ + struct xfs_buf *rbp; /* right buffer pointer */ + struct xfs_btree_block *right; /* right btree block */ + struct xfs_btree_block *rrblock; /* right-right btree block */ + struct xfs_buf *rrbp; /* right-right buffer pointer */ + int rrecs = 0; /* right record count */ + struct xfs_btree_cur *tcur; /* temporary btree cursor */ + int numrecs; /* temporary numrec count */ + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + XFS_BTREE_TRACE_ARGI(cur, level); + + tcur = NULL; + + /* Get the index of the entry being deleted, check for nothing there. */ + ptr = cur->bc_ptrs[level]; + if (ptr == 0) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + /* Get the buffer & block containing the record or key/ptr. */ + block = xfs_btree_get_block(cur, level, &bp); + numrecs = xfs_btree_get_numrecs(block); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, level, bp); + if (error) + goto error0; +#endif + + /* Fail if we're off the end of the block. */ + if (ptr > numrecs) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 0; + return 0; + } + + XFS_BTREE_STATS_INC(cur, delrec); + XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr); + + /* Excise the entries being deleted. */ + if (level > 0) { + /* It's a nonleaf. operate on keys and ptrs */ + union xfs_btree_key *lkp; + union xfs_btree_ptr *lpp; + + lkp = xfs_btree_key_addr(cur, ptr + 1, block); + lpp = xfs_btree_ptr_addr(cur, ptr + 1, block); + +#ifdef DEBUG + for (i = 0; i < numrecs - ptr; i++) { + error = xfs_btree_check_ptr(cur, lpp, i, level); + if (error) + goto error0; + } +#endif + + if (ptr < numrecs) { + xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr); + xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr); + xfs_btree_log_keys(cur, bp, ptr, numrecs - 1); + xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need to pass a + * key up to the next level (updkey). + */ + if (ptr == 1) + keyp = xfs_btree_key_addr(cur, 1, block); + } else { + /* It's a leaf. operate on records */ + if (ptr < numrecs) { + xfs_btree_shift_recs(cur, + xfs_btree_rec_addr(cur, ptr + 1, block), + -1, numrecs - ptr); + xfs_btree_log_recs(cur, bp, ptr, numrecs - 1); + } + + /* + * If it's the first record in the block, we'll need a key + * structure to pass up to the next level (updkey). + */ + if (ptr == 1) { + cur->bc_ops->init_key_from_rec(&key, + xfs_btree_rec_addr(cur, 1, block)); + keyp = &key; + } + } + + /* + * Decrement and log the number of entries in the block. + */ + xfs_btree_set_numrecs(block, --numrecs); + xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS); + + /* + * If we are tracking the last record in the tree and + * we are at the far right edge of the tree, update it. + */ + if (xfs_btree_is_lastrec(cur, block, level)) { + cur->bc_ops->update_lastrec(cur, block, NULL, + ptr, LASTREC_DELREC); + } + + /* + * We're at the root level. First, shrink the root block in-memory. + * Try to get rid of the next level down. If we can't then there's + * nothing left to do. + */ + if (level == cur->bc_nlevels - 1) { + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + xfs_iroot_realloc(cur->bc_private.b.ip, -1, + cur->bc_private.b.whichfork); + + error = xfs_btree_kill_iroot(cur); + if (error) + goto error0; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + *stat = 1; + return 0; + } + + /* + * If this is the root level, and there's only one entry left, + * and it's NOT the leaf level, then we can get rid of this + * level. + */ + if (numrecs == 1 && level > 0) { + union xfs_btree_ptr *pp; + /* + * pp is still set to the first pointer in the block. + * Make it the new root of the btree. + */ + pp = xfs_btree_ptr_addr(cur, 1, block); + error = xfs_btree_kill_root(cur, bp, level, pp); + if (error) + goto error0; + } else if (level > 0) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + } + *stat = 1; + return 0; + } + + /* + * If we deleted the leftmost entry in the block, update the + * key values above us in the tree. + */ + if (ptr == 1) { + error = xfs_btree_updkey(cur, keyp, level + 1); + if (error) + goto error0; + } + + /* + * If the number of records remaining in the block is at least + * the minimum, we're done. + */ + if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + /* + * Otherwise, we have to move some records around to keep the + * tree balanced. Look at the left and right sibling blocks to + * see if we can re-balance by moving only one record. + */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB); + + if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) { + /* + * One child of root, need to get a chance to copy its contents + * into the root and delete it. Can't go up to next level, + * there's nothing to delete there. + */ + if (xfs_btree_ptr_is_null(cur, &rptr) && + xfs_btree_ptr_is_null(cur, &lptr) && + level == cur->bc_nlevels - 2) { + error = xfs_btree_kill_iroot(cur); + if (!error) + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) || + !xfs_btree_ptr_is_null(cur, &lptr)); + + /* + * Duplicate the cursor so our btree manipulations here won't + * disrupt the next level up. + */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * If there's a right sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &rptr)) { + /* + * Move the temp cursor to the last entry in the next block. + * Actually any entry but the first would suffice. + */ + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_increment(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + i = xfs_btree_lastrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + right = xfs_btree_get_block(tcur, level, &rbp); +#ifdef DEBUG + error = xfs_btree_check_block(tcur, right, level, rbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB); + + /* + * If right block is full enough so that removing one entry + * won't make it too empty, and left-shifting an entry out + * of right to us works, we're done. + */ + if (xfs_btree_get_numrecs(right) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_lshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference, and fix up the temp cursor to point + * to our block again (last record). + */ + rrecs = xfs_btree_get_numrecs(right); + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + } + + /* + * If there's a left sibling, see if it's ok to shift an entry + * out of it. + */ + if (!xfs_btree_ptr_is_null(cur, &lptr)) { + /* + * Move the temp cursor to the first entry in the + * previous block. + */ + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_btree_decrement(tcur, level, &i); + if (error) + goto error0; + i = xfs_btree_firstrec(tcur, level); + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + /* Grab a pointer to the block. */ + left = xfs_btree_get_block(tcur, level, &lbp); +#ifdef DEBUG + error = xfs_btree_check_block(cur, left, level, lbp); + if (error) + goto error0; +#endif + /* Grab the current block number, for future use. */ + xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB); + + /* + * If left block is full enough so that removing one entry + * won't make it too empty, and right-shifting an entry out + * of left to us works, we're done. + */ + if (xfs_btree_get_numrecs(left) - 1 >= + cur->bc_ops->get_minrecs(tcur, level)) { + error = xfs_btree_rshift(tcur, level, &i); + if (error) + goto error0; + if (i) { + ASSERT(xfs_btree_get_numrecs(block) >= + cur->bc_ops->get_minrecs(tcur, level)); + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + if (level == 0) + cur->bc_ptrs[0]++; + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = 1; + return 0; + } + } + + /* + * Otherwise, grab the number of records in right for + * future reference. + */ + lrecs = xfs_btree_get_numrecs(left); + } + + /* Delete the temp cursor, we're done with it. */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + tcur = NULL; + + /* If here, we need to do a join to keep the tree balanced. */ + ASSERT(!xfs_btree_ptr_is_null(cur, &cptr)); + + if (!xfs_btree_ptr_is_null(cur, &lptr) && + lrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "right" to be the starting block, + * "left" to be the left neighbor. + */ + rptr = cptr; + right = block; + rbp = bp; + error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp); + if (error) + goto error0; + + /* + * If that won't work, see if we can join with the right neighbor block. + */ + } else if (!xfs_btree_ptr_is_null(cur, &rptr) && + rrecs + xfs_btree_get_numrecs(block) <= + cur->bc_ops->get_maxrecs(cur, level)) { + /* + * Set "left" to be the starting block, + * "right" to be the right neighbor. + */ + lptr = cptr; + left = block; + lbp = bp; + error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp); + if (error) + goto error0; + + /* + * Otherwise, we can't fix the imbalance. + * Just return. This is probably a logic error, but it's not fatal. + */ + } else { + error = xfs_btree_dec_cursor(cur, level, stat); + if (error) + goto error0; + return 0; + } + + rrecs = xfs_btree_get_numrecs(right); + lrecs = xfs_btree_get_numrecs(left); + + /* + * We're now going to join "left" and "right" by moving all the stuff + * in "right" to "left" and deleting "right". + */ + XFS_BTREE_STATS_ADD(cur, moves, rrecs); + if (level > 0) { + /* It's a non-leaf. Move keys and pointers. */ + union xfs_btree_key *lkp; /* left btree key */ + union xfs_btree_ptr *lpp; /* left address pointer */ + union xfs_btree_key *rkp; /* right btree key */ + union xfs_btree_ptr *rpp; /* right address pointer */ + + lkp = xfs_btree_key_addr(cur, lrecs + 1, left); + lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left); + rkp = xfs_btree_key_addr(cur, 1, right); + rpp = xfs_btree_ptr_addr(cur, 1, right); +#ifdef DEBUG + for (i = 1; i < rrecs; i++) { + error = xfs_btree_check_ptr(cur, rpp, i, level); + if (error) + goto error0; + } +#endif + xfs_btree_copy_keys(cur, lkp, rkp, rrecs); + xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs); + + xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); + } else { + /* It's a leaf. Move records. */ + union xfs_btree_rec *lrp; /* left record pointer */ + union xfs_btree_rec *rrp; /* right record pointer */ + + lrp = xfs_btree_rec_addr(cur, lrecs + 1, left); + rrp = xfs_btree_rec_addr(cur, 1, right); + + xfs_btree_copy_recs(cur, lrp, rrp, rrecs); + xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + + XFS_BTREE_STATS_INC(cur, join); + + /* + * Fix up the number of records and right block pointer in the + * surviving block, and log it. + */ + xfs_btree_set_numrecs(left, lrecs + rrecs); + xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB), + xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); + + /* If there is a right sibling, point it to the remaining block. */ + xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB); + if (!xfs_btree_ptr_is_null(cur, &cptr)) { + error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp); + if (error) + goto error0; + xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB); + xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB); + } + + /* Free the deleted block. */ + error = cur->bc_ops->free_block(cur, rbp); + if (error) + goto error0; + XFS_BTREE_STATS_INC(cur, free); + + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. + */ + if (bp != lbp) { + cur->bc_bufs[level] = lbp; + cur->bc_ptrs[level] += lrecs; + cur->bc_ra[level] = 0; + } + /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || + (level + 1 < cur->bc_nlevels)) { + error = xfs_btree_increment(cur, level + 1, &i); + if (error) + goto error0; + } + + /* + * Readjust the ptr at this level if it's not a leaf, since it's + * still pointing at the deletion point, which makes the cursor + * inconsistent. If this makes the ptr 0, the caller fixes it up. + * We can't use decrement because it would change the next level up. + */ + if (level > 0) + cur->bc_ptrs[level]--; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + /* Return value means the next level up has something to do. */ + *stat = 2; + return 0; + +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + if (tcur) + xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Delete the record pointed to by cur. + * The cursor refers to the place where the record was (could be inserted) + * when the operation returns. + */ +int /* error */ +xfs_btree_delete( + struct xfs_btree_cur *cur, + int *stat) /* success/failure */ +{ + int error; /* error return value */ + int level; + int i; + + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); + + /* + * Go up the tree, starting at leaf level. + * + * If 2 is returned then a join was done; go to the next level. + * Otherwise we are done. + */ + for (level = 0, i = 2; i == 2; level++) { + error = xfs_btree_delrec(cur, level, &i); + if (error) + goto error0; + } + + if (i == 0) { + for (level = 1; level < cur->bc_nlevels; level++) { + if (cur->bc_ptrs[level] == 0) { + error = xfs_btree_decrement(cur, level, &i); + if (error) + goto error0; + break; + } + } + } + + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + *stat = i; + return 0; +error0: + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); + return error; +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_btree_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + union xfs_btree_rec **recp, /* output: btree record */ + int *stat) /* output: success/failure */ +{ + struct xfs_btree_block *block; /* btree block */ + struct xfs_buf *bp; /* buffer pointer */ + int ptr; /* record number */ +#ifdef DEBUG + int error; /* error return value */ +#endif + + ptr = cur->bc_ptrs[0]; + block = xfs_btree_get_block(cur, 0, &bp); + +#ifdef DEBUG + error = xfs_btree_check_block(cur, block, 0, bp); + if (error) + return error; +#endif + + /* + * Off the right end or left end, return failure. + */ + if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) { + *stat = 0; + return 0; + } + + /* + * Point to the record and extract its data. + */ + *recp = xfs_btree_rec_addr(cur, ptr, block); + *stat = 1; + return 0; +} + +/* + * Change the owner of a btree. + * + * The mechanism we use here is ordered buffer logging. Because we don't know + * how many buffers were are going to need to modify, we don't really want to + * have to make transaction reservations for the worst case of every buffer in a + * full size btree as that may be more space that we can fit in the log.... + * + * We do the btree walk in the most optimal manner possible - we have sibling + * pointers so we can just walk all the blocks on each level from left to right + * in a single pass, and then move to the next level and do the same. We can + * also do readahead on the sibling pointers to get IO moving more quickly, + * though for slow disks this is unlikely to make much difference to performance + * as the amount of CPU work we have to do before moving to the next block is + * relatively small. + * + * For each btree block that we load, modify the owner appropriately, set the + * buffer as an ordered buffer and log it appropriately. We need to ensure that + * we mark the region we change dirty so that if the buffer is relogged in + * a subsequent transaction the changes we make here as an ordered buffer are + * correctly relogged in that transaction. If we are in recovery context, then + * just queue the modified buffer as delayed write buffer so the transaction + * recovery completion writes the changes to disk. + */ +static int +xfs_btree_block_change_owner( + struct xfs_btree_cur *cur, + int level, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + union xfs_btree_ptr rptr; + + /* do right sibling readahead */ + xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); + + /* modify the owner */ + block = xfs_btree_get_block(cur, level, &bp); + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + block->bb_u.l.bb_owner = cpu_to_be64(new_owner); + else + block->bb_u.s.bb_owner = cpu_to_be32(new_owner); + + /* + * If the block is a root block hosted in an inode, we might not have a + * buffer pointer here and we shouldn't attempt to log the change as the + * information is already held in the inode and discarded when the root + * block is formatted into the on-disk inode fork. We still change it, + * though, so everything is consistent in memory. + */ + if (bp) { + if (cur->bc_tp) { + xfs_trans_ordered_buf(cur->bc_tp, bp); + xfs_btree_log_block(cur, bp, XFS_BB_OWNER); + } else { + xfs_buf_delwri_queue(bp, buffer_list); + } + } else { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + } + + /* now read rh sibling block for next iteration */ + xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); + if (xfs_btree_ptr_is_null(cur, &rptr)) + return ENOENT; + + return xfs_btree_lookup_get_block(cur, level, &rptr, &block); +} + +int +xfs_btree_change_owner( + struct xfs_btree_cur *cur, + __uint64_t new_owner, + struct list_head *buffer_list) +{ + union xfs_btree_ptr lptr; + int level; + struct xfs_btree_block *block = NULL; + int error = 0; + + cur->bc_ops->init_ptr_from_cur(cur, &lptr); + + /* for each level */ + for (level = cur->bc_nlevels - 1; level >= 0; level--) { + /* grab the left hand block */ + error = xfs_btree_lookup_get_block(cur, level, &lptr, &block); + if (error) + return error; + + /* readahead the left most block for the next level down */ + if (level > 0) { + union xfs_btree_ptr *ptr; + + ptr = xfs_btree_ptr_addr(cur, 1, block); + xfs_btree_readahead_ptr(cur, ptr, 1); + + /* save for the next iteration of the loop */ + lptr = *ptr; + } + + /* for each buffer in the level */ + do { + error = xfs_btree_block_change_owner(cur, level, + new_owner, + buffer_list); + } while (!error); + + if (error != ENOENT) + return error; + } + + return 0; +} diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 44f1bd98064..a04b69422f6 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h @@ -24,6 +24,33 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; +extern kmem_zone_t *xfs_btree_cur_zone; + +/* + * Generic key, ptr and record wrapper structures. + * + * These are disk format structures, and are converted where necessary + * by the btree specific code that needs to interpret them. + */ +union xfs_btree_ptr { + __be32 s; /* short form ptr */ + __be64 l; /* long form ptr */ +}; + +union xfs_btree_key { + xfs_bmbt_key_t bmbt; + xfs_bmdr_key_t bmbr; /* bmbt root block */ + xfs_alloc_key_t alloc; + xfs_inobt_key_t inobt; +}; + +union xfs_btree_rec { + xfs_bmbt_rec_t bmbt; + xfs_bmdr_rec_t bmbr; /* bmbt root block */ + xfs_alloc_rec_t alloc; + xfs_inobt_rec_t inobt; +}; + /* * This nonsense is to make -wlint happy. */ @@ -35,104 +62,128 @@ struct xfs_trans; #define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi) #define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi) #define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi) +#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi) /* - * Short form header: space allocation btrees. + * For logging record fields. */ -typedef struct xfs_btree_sblock { - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ - __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */ - __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */ -} xfs_btree_sblock_t; +#define XFS_BB_MAGIC (1 << 0) +#define XFS_BB_LEVEL (1 << 1) +#define XFS_BB_NUMRECS (1 << 2) +#define XFS_BB_LEFTSIB (1 << 3) +#define XFS_BB_RIGHTSIB (1 << 4) +#define XFS_BB_BLKNO (1 << 5) +#define XFS_BB_LSN (1 << 6) +#define XFS_BB_UUID (1 << 7) +#define XFS_BB_OWNER (1 << 8) +#define XFS_BB_NUM_BITS 5 +#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) +#define XFS_BB_NUM_BITS_CRC 9 +#define XFS_BB_ALL_BITS_CRC ((1 << XFS_BB_NUM_BITS_CRC) - 1) + +/* + * Generic stats interface + */ +#define __XFS_BTREE_STATS_INC(type, stat) \ + XFS_STATS_INC(xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) + +#define __XFS_BTREE_STATS_ADD(type, stat, val) \ + XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ +do { \ + switch (cur->bc_btnum) { \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + } \ +} while (0) -/* - * Long form header: bmap btrees. - */ -typedef struct xfs_btree_lblock { - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ - __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */ - __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */ -} xfs_btree_lblock_t; +#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ -/* - * Combined header and structure, used by common code. - */ -typedef struct xfs_btree_hdr -{ - __be32 bb_magic; /* magic number for block type */ - __be16 bb_level; /* 0 is a leaf */ - __be16 bb_numrecs; /* current # of data records */ -} xfs_btree_hdr_t; +struct xfs_btree_ops { + /* size of the key and record structures */ + size_t key_len; + size_t rec_len; -typedef struct xfs_btree_block { - xfs_btree_hdr_t bb_h; /* header */ - union { - struct { - __be32 bb_leftsib; - __be32 bb_rightsib; - } s; /* short form pointers */ - struct { - __be64 bb_leftsib; - __be64 bb_rightsib; - } l; /* long form pointers */ - } bb_u; /* rest */ -} xfs_btree_block_t; + /* cursor operations */ + struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *); + void (*update_cursor)(struct xfs_btree_cur *src, + struct xfs_btree_cur *dst); -/* - * For logging record fields. - */ -#define XFS_BB_MAGIC 0x01 -#define XFS_BB_LEVEL 0x02 -#define XFS_BB_NUMRECS 0x04 -#define XFS_BB_LEFTSIB 0x08 -#define XFS_BB_RIGHTSIB 0x10 -#define XFS_BB_NUM_BITS 5 -#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1) + /* update btree root pointer */ + void (*set_root)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, int level_change); -/* - * Boolean to select which form of xfs_btree_block_t.bb_u to use. - */ -#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP) + /* block allocation / freeing */ + int (*alloc_block)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat); + int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); -/* - * Magic numbers for btree blocks. - */ -extern const __uint32_t xfs_magics[]; + /* update last record information */ + void (*update_lastrec)(struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + union xfs_btree_rec *rec, + int ptr, int reason); -/* - * Maximum and minimum records in a btree block. - * Given block size, type prefix, and leaf flag (0 or 1). - * The divisor below is equivalent to lf ? (e1) : (e2) but that produces - * compiler warnings. - */ -#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \ - ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \ - (((lf) * (uint)sizeof(t ## _rec_t)) + \ - ((1 - (lf)) * \ - ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t)))))) -#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \ - (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2) + /* records in block/level */ + int (*get_minrecs)(struct xfs_btree_cur *cur, int level); + int (*get_maxrecs)(struct xfs_btree_cur *cur, int level); + + /* records on disk. Matter for the root in inode case. */ + int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level); + + /* init values of btree structures */ + void (*init_key_from_rec)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_key)(union xfs_btree_key *key, + union xfs_btree_rec *rec); + void (*init_rec_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_rec *rec); + void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr); + + /* difference between key value and cursor value */ + __int64_t (*key_diff)(struct xfs_btree_cur *cur, + union xfs_btree_key *key); + + const struct xfs_buf_ops *buf_ops; + +#if defined(DEBUG) || defined(XFS_WARN) + /* check that k1 is lower than k2 */ + int (*keys_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2); + + /* check that r1 is lower than r2 */ + int (*recs_inorder)(struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2); +#endif +}; /* - * Record, key, and pointer address calculation macros. - * Given block size, type prefix, block pointer, and index of requested entry - * (first entry numbered 1). + * Reasons for the update_lastrec method to be called. */ -#define XFS_BTREE_REC_ADDR(bsz,t,bb,i,mxr) \ - ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \ - ((i) - 1) * sizeof(t ## _rec_t))) -#define XFS_BTREE_KEY_ADDR(bsz,t,bb,i,mxr) \ - ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \ - ((i) - 1) * sizeof(t ## _key_t))) -#define XFS_BTREE_PTR_ADDR(bsz,t,bb,i,mxr) \ - ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \ - (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t))) +#define LASTREC_UPDATE 0 +#define LASTREC_INSREC 1 +#define LASTREC_DELREC 2 -#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */ /* * Btree cursor structure. @@ -142,10 +193,12 @@ typedef struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ + const struct xfs_btree_ops *bc_ops; + uint bc_flags; /* btree features - below */ union { xfs_alloc_rec_incore_t a; xfs_bmbt_irec_t b; - xfs_inobt_rec_t i; + xfs_inobt_rec_incore_t i; } bc_rec; /* current insert/search record value */ struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ @@ -156,8 +209,8 @@ typedef struct xfs_btree_cur __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ union { - struct { /* needed for BNO, CNT */ - struct xfs_buf *agbp; /* agf buffer pointer */ + struct { /* needed for BNO, CNT, INO */ + struct xfs_buf *agbp; /* agf/agi buffer pointer */ xfs_agnumber_t agno; /* ag number */ } a; struct { /* needed for BMAP */ @@ -170,99 +223,45 @@ typedef struct xfs_btree_cur char flags; /* flags */ #define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ } b; - struct { /* needed for INO */ - struct xfs_buf *agbp; /* agi buffer pointer */ - xfs_agnumber_t agno; /* ag number */ - } i; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; +/* cursor flags */ +#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ +#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */ +#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */ +#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */ + + #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 /* * Convert from buffer to btree block header. */ -#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp)) -#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp)) -#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)((bp)->b_addr)) -#ifdef __KERNEL__ - -#ifdef DEBUG /* - * Debug routine: check that block header is ok. + * Check that block header is ok. */ -void +int xfs_btree_check_block( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_block_t *block, /* generic btree block pointer */ + struct xfs_btree_cur *cur, /* btree cursor */ + struct xfs_btree_block *block, /* generic btree block pointer */ int level, /* level of the btree block */ struct xfs_buf *bp); /* buffer containing block, if any */ /* - * Debug routine: check that keys are in the right order. - */ -void -xfs_btree_check_key( - xfs_btnum_t btnum, /* btree identifier */ - void *ak1, /* pointer to left (lower) key */ - void *ak2); /* pointer to right (higher) key */ - -/* - * Debug routine: check that records are in the right order. - */ -void -xfs_btree_check_rec( - xfs_btnum_t btnum, /* btree identifier */ - void *ar1, /* pointer to left (lower) record */ - void *ar2); /* pointer to right (higher) record */ -#else -#define xfs_btree_check_block(a,b,c,d) -#define xfs_btree_check_key(a,b,c) -#define xfs_btree_check_rec(a,b,c) -#endif /* DEBUG */ - -/* - * Checking routine: check that long form block header is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_lblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_lblock_t *block, /* btree long form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp); /* buffer containing block, if any */ - -/* - * Checking routine: check that (long) pointer is ok. + * Check that (long) pointer is ok. */ int /* error (0 or EFSCORRUPTED) */ xfs_btree_check_lptr( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ xfs_dfsbno_t ptr, /* btree block disk address */ int level); /* btree block level */ /* - * Checking routine: check that short form block header is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sblock( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_btree_sblock_t *block, /* btree short form block pointer */ - int level, /* level of the btree block */ - struct xfs_buf *bp); /* buffer containing block */ - -/* - * Checking routine: check that (short) pointer is ok. - */ -int /* error (0 or EFSCORRUPTED) */ -xfs_btree_check_sptr( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agblock_t ptr, /* btree block disk address */ - int level); /* btree block level */ - -/* * Delete the btree cursor. */ void @@ -280,15 +279,6 @@ xfs_btree_dup_cursor( xfs_btree_cur_t **ncur);/* output cursor */ /* - * Change the cursor to point to the first record in the current block - * at the given level. Other levels are unaffected. - */ -int /* success=1, failure=0 */ -xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level); /* level to change */ - -/* * Get a buffer for the block, return it with no data read. * Long-form addressing. */ @@ -312,20 +302,6 @@ xfs_btree_get_bufs( uint lock); /* lock flags for get_buf */ /* - * Allocate a new btree cursor. - * The cursor is either for allocation (A) or bmap (B). - */ -xfs_btree_cur_t * /* new btree cursor */ -xfs_btree_init_cursor( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - struct xfs_buf *agbp, /* (A only) buffer for agf structure */ - xfs_agnumber_t agno, /* (A only) allocation group number */ - xfs_btnum_t btnum, /* btree identifier */ - struct xfs_inode *ip, /* (B only) inode owning the btree */ - int whichfork); /* (B only) data/attr fork */ - -/* * Check for the cursor referring to the last block at the given level. */ int /* 1=is last block, 0=not last block */ @@ -334,15 +310,6 @@ xfs_btree_islastblock( int level); /* level to check */ /* - * Change the cursor to point to the last record in the current block - * at the given level. Other levels are unaffected. - */ -int /* success=1, failure=0 */ -xfs_btree_lastrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level); /* level to change */ - -/* * Compute first and last byte offsets for the fields given. * Interprets the offsets table, which contains struct field offsets. */ @@ -365,21 +332,8 @@ xfs_btree_read_bufl( xfs_fsblock_t fsbno, /* file system block number */ uint lock, /* lock flags for read_buf */ struct xfs_buf **bpp, /* buffer for fsbno */ - int refval);/* ref count value for buffer */ - -/* - * Get a buffer for the block, return it read in. - * Short-form addressing. - */ -int /* error */ -xfs_btree_read_bufs( - struct xfs_mount *mp, /* file system mount point */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_agblock_t agbno, /* allocation group block number */ - uint lock, /* lock flags for read_buf */ - struct xfs_buf **bpp, /* buffer for agno/agbno */ - int refval);/* ref count value for buffer */ + int refval, /* ref count value for buffer */ + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -389,7 +343,8 @@ void /* error */ xfs_btree_reada_bufl( struct xfs_mount *mp, /* file system mount point */ xfs_fsblock_t fsbno, /* file system block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); /* * Read-ahead the block, don't wait for it, don't return a buffer. @@ -400,74 +355,114 @@ xfs_btree_reada_bufs( struct xfs_mount *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* allocation group block number */ - xfs_extlen_t count); /* count of filesystem blocks */ + xfs_extlen_t count, /* count of filesystem blocks */ + const struct xfs_buf_ops *ops); /* - * Read-ahead btree blocks, at the given level. - * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA. + * Initialise a new btree block header */ -int /* readahead block count */ -xfs_btree_readahead_core( - xfs_btree_cur_t *cur, /* btree cursor */ - int lev, /* level in btree */ - int lr); /* left/right bits */ +void +xfs_btree_init_block( + struct xfs_mount *mp, + struct xfs_buf *bp, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); -static inline int /* readahead block count */ -xfs_btree_readahead( - xfs_btree_cur_t *cur, /* btree cursor */ - int lev, /* level in btree */ - int lr) /* left/right bits */ -{ - if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) - return 0; +void +xfs_btree_init_block_int( + struct xfs_mount *mp, + struct xfs_btree_block *buf, + xfs_daddr_t blkno, + __u32 magic, + __u16 level, + __u16 numrecs, + __u64 owner, + unsigned int flags); - return xfs_btree_readahead_core(cur, lev, lr); -} +/* + * Common btree core entry points. + */ +int xfs_btree_increment(struct xfs_btree_cur *, int, int *); +int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); +int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); +int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); +int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); +int xfs_btree_insert(struct xfs_btree_cur *, int *); +int xfs_btree_delete(struct xfs_btree_cur *, int *); +int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); +int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner, + struct list_head *buffer_list); +/* + * btree block CRC helpers + */ +void xfs_btree_lblock_calc_crc(struct xfs_buf *); +bool xfs_btree_lblock_verify_crc(struct xfs_buf *); +void xfs_btree_sblock_calc_crc(struct xfs_buf *); +bool xfs_btree_sblock_verify_crc(struct xfs_buf *); /* - * Set the buffer for level "lev" in the cursor to bp, releasing - * any previous buffer. + * Internal btree helpers also used by xfs_bmap.c. */ -void -xfs_btree_setbuf( - xfs_btree_cur_t *cur, /* btree cursor */ - int lev, /* level in btree */ - struct xfs_buf *bp); /* new buffer to set */ +void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int); +void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); + +/* + * Helpers. + */ +static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_numrecs); +} + +static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, + __uint16_t numrecs) +{ + block->bb_numrecs = cpu_to_be16(numrecs); +} -#endif /* __KERNEL__ */ +static inline int xfs_btree_get_level(struct xfs_btree_block *block) +{ + return be16_to_cpu(block->bb_level); +} /* * Min and max functions for extlen, agblock, fileoff, and filblks types. */ -#define XFS_EXTLEN_MIN(a,b) \ - ((xfs_extlen_t)(a) < (xfs_extlen_t)(b) ? \ - (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) -#define XFS_EXTLEN_MAX(a,b) \ - ((xfs_extlen_t)(a) > (xfs_extlen_t)(b) ? \ - (xfs_extlen_t)(a) : (xfs_extlen_t)(b)) -#define XFS_AGBLOCK_MIN(a,b) \ - ((xfs_agblock_t)(a) < (xfs_agblock_t)(b) ? \ - (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) -#define XFS_AGBLOCK_MAX(a,b) \ - ((xfs_agblock_t)(a) > (xfs_agblock_t)(b) ? \ - (xfs_agblock_t)(a) : (xfs_agblock_t)(b)) -#define XFS_FILEOFF_MIN(a,b) \ - ((xfs_fileoff_t)(a) < (xfs_fileoff_t)(b) ? \ - (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) -#define XFS_FILEOFF_MAX(a,b) \ - ((xfs_fileoff_t)(a) > (xfs_fileoff_t)(b) ? \ - (xfs_fileoff_t)(a) : (xfs_fileoff_t)(b)) -#define XFS_FILBLKS_MIN(a,b) \ - ((xfs_filblks_t)(a) < (xfs_filblks_t)(b) ? \ - (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) -#define XFS_FILBLKS_MAX(a,b) \ - ((xfs_filblks_t)(a) > (xfs_filblks_t)(b) ? \ - (xfs_filblks_t)(a) : (xfs_filblks_t)(b)) +#define XFS_EXTLEN_MIN(a,b) min_t(xfs_extlen_t, (a), (b)) +#define XFS_EXTLEN_MAX(a,b) max_t(xfs_extlen_t, (a), (b)) +#define XFS_AGBLOCK_MIN(a,b) min_t(xfs_agblock_t, (a), (b)) +#define XFS_AGBLOCK_MAX(a,b) max_t(xfs_agblock_t, (a), (b)) +#define XFS_FILEOFF_MIN(a,b) min_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILEOFF_MAX(a,b) max_t(xfs_fileoff_t, (a), (b)) +#define XFS_FILBLKS_MIN(a,b) min_t(xfs_filblks_t, (a), (b)) +#define XFS_FILBLKS_MAX(a,b) max_t(xfs_filblks_t, (a), (b)) #define XFS_FSB_SANITY_CHECK(mp,fsb) \ (XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \ XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks) +/* + * Trace hooks. Currently not implemented as they need to be ported + * over to the generic tracing functionality, which is some effort. + * + * i,j = integer (32 bit) + * b = btree block buffer (xfs_buf_t) + * p = btree ptr + * r = btree record + * k = btree key + */ +#define XFS_BTREE_TRACE_ARGBI(c, b, i) +#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) +#define XFS_BTREE_TRACE_ARGI(c, i) +#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s) +#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) +#define XFS_BTREE_TRACE_ARGIK(c, i, k) +#define XFS_BTREE_TRACE_ARGR(c, r) +#define XFS_BTREE_TRACE_CURSOR(c, t) + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c new file mode 100644 index 00000000000..7a34a1ae655 --- /dev/null +++ b/fs/xfs/xfs_buf.c @@ -0,0 +1,1890 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include <linux/stddef.h> +#include <linux/errno.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/vmalloc.h> +#include <linux/bio.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/workqueue.h> +#include <linux/percpu.h> +#include <linux/blkdev.h> +#include <linux/hash.h> +#include <linux/kthread.h> +#include <linux/migrate.h> +#include <linux/backing-dev.h> +#include <linux/freezer.h> + +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trace.h" +#include "xfs_log.h" + +static kmem_zone_t *xfs_buf_zone; + +static struct workqueue_struct *xfslogd_workqueue; + +#ifdef XFS_BUF_LOCK_TRACKING +# define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) +# define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) +# define XB_GET_OWNER(bp) ((bp)->b_last_holder) +#else +# define XB_SET_OWNER(bp) do { } while (0) +# define XB_CLEAR_OWNER(bp) do { } while (0) +# define XB_GET_OWNER(bp) do { } while (0) +#endif + +#define xb_to_gfp(flags) \ + ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) + + +static inline int +xfs_buf_is_vmapped( + struct xfs_buf *bp) +{ + /* + * Return true if the buffer is vmapped. + * + * b_addr is null if the buffer is not mapped, but the code is clever + * enough to know it doesn't have to map a single page, so the check has + * to be both for b_addr and bp->b_page_count > 1. + */ + return bp->b_addr && bp->b_page_count > 1; +} + +static inline int +xfs_buf_vmap_len( + struct xfs_buf *bp) +{ + return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; +} + +/* + * When we mark a buffer stale, we remove the buffer from the LRU and clear the + * b_lru_ref count so that the buffer is freed immediately when the buffer + * reference count falls to zero. If the buffer is already on the LRU, we need + * to remove the reference that LRU holds on the buffer. + * + * This prevents build-up of stale buffers on the LRU. + */ +void +xfs_buf_stale( + struct xfs_buf *bp) +{ + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_STALE; + + /* + * Clear the delwri status so that a delwri queue walker will not + * flush this buffer to disk now that it is stale. The delwri queue has + * a reference to the buffer, so this is safe to do. + */ + bp->b_flags &= ~_XBF_DELWRI_Q; + + spin_lock(&bp->b_lock); + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) + atomic_dec(&bp->b_hold); + + ASSERT(atomic_read(&bp->b_hold) >= 1); + spin_unlock(&bp->b_lock); +} + +static int +xfs_buf_get_maps( + struct xfs_buf *bp, + int map_count) +{ + ASSERT(bp->b_maps == NULL); + bp->b_map_count = map_count; + + if (map_count == 1) { + bp->b_maps = &bp->__b_map; + return 0; + } + + bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), + KM_NOFS); + if (!bp->b_maps) + return ENOMEM; + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +static void +xfs_buf_free_maps( + struct xfs_buf *bp) +{ + if (bp->b_maps != &bp->__b_map) { + kmem_free(bp->b_maps); + bp->b_maps = NULL; + } +} + +struct xfs_buf * +_xfs_buf_alloc( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + int error; + int i; + + bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); + if (unlikely(!bp)) + return NULL; + + /* + * We don't want certain flags to appear in b_flags unless they are + * specifically set by later operations on the buffer. + */ + flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + + atomic_set(&bp->b_hold, 1); + atomic_set(&bp->b_lru_ref, 1); + init_completion(&bp->b_iowait); + INIT_LIST_HEAD(&bp->b_lru); + INIT_LIST_HEAD(&bp->b_list); + RB_CLEAR_NODE(&bp->b_rbnode); + sema_init(&bp->b_sema, 0); /* held, no waiters */ + spin_lock_init(&bp->b_lock); + XB_SET_OWNER(bp); + bp->b_target = target; + bp->b_flags = flags; + + /* + * Set length and io_length to the same value initially. + * I/O routines should use io_length, which will be the same in + * most cases but may be reset (e.g. XFS recovery). + */ + error = xfs_buf_get_maps(bp, nmaps); + if (error) { + kmem_zone_free(xfs_buf_zone, bp); + return NULL; + } + + bp->b_bn = map[0].bm_bn; + bp->b_length = 0; + for (i = 0; i < nmaps; i++) { + bp->b_maps[i].bm_bn = map[i].bm_bn; + bp->b_maps[i].bm_len = map[i].bm_len; + bp->b_length += map[i].bm_len; + } + bp->b_io_length = bp->b_length; + + atomic_set(&bp->b_pin_count, 0); + init_waitqueue_head(&bp->b_waiters); + + XFS_STATS_INC(xb_create); + trace_xfs_buf_init(bp, _RET_IP_); + + return bp; +} + +/* + * Allocate a page array capable of holding a specified number + * of pages, and point the page buf at it. + */ +STATIC int +_xfs_buf_get_pages( + xfs_buf_t *bp, + int page_count) +{ + /* Make sure that we have a page list */ + if (bp->b_pages == NULL) { + bp->b_page_count = page_count; + if (page_count <= XB_PAGES) { + bp->b_pages = bp->b_page_array; + } else { + bp->b_pages = kmem_alloc(sizeof(struct page *) * + page_count, KM_NOFS); + if (bp->b_pages == NULL) + return -ENOMEM; + } + memset(bp->b_pages, 0, sizeof(struct page *) * page_count); + } + return 0; +} + +/* + * Frees b_pages if it was allocated. + */ +STATIC void +_xfs_buf_free_pages( + xfs_buf_t *bp) +{ + if (bp->b_pages != bp->b_page_array) { + kmem_free(bp->b_pages); + bp->b_pages = NULL; + } +} + +/* + * Releases the specified buffer. + * + * The modification state of any associated pages is left unchanged. + * The buffer must not be on any hash - use xfs_buf_rele instead for + * hashed and refcounted buffers + */ +void +xfs_buf_free( + xfs_buf_t *bp) +{ + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (bp->b_flags & _XBF_PAGES) { + uint i; + + if (xfs_buf_is_vmapped(bp)) + vm_unmap_ram(bp->b_addr - bp->b_offset, + bp->b_page_count); + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page = bp->b_pages[i]; + + __free_page(page); + } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); + _xfs_buf_free_pages(bp); + xfs_buf_free_maps(bp); + kmem_zone_free(xfs_buf_zone, bp); +} + +/* + * Allocates all the pages for buffer in question and builds it's page list. + */ +STATIC int +xfs_buf_allocate_memory( + xfs_buf_t *bp, + uint flags) +{ + size_t size; + size_t nbytes, offset; + gfp_t gfp_mask = xb_to_gfp(flags); + unsigned short page_count, i; + xfs_off_t start, end; + int error; + + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0. + */ + size = BBTOB(bp->b_length); + if (size < PAGE_SIZE) { + bp->b_addr = kmem_alloc(size, KM_NOFS); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= _XBF_KMEM; + return 0; + } + +use_alloc_page: + start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; + end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) + >> PAGE_SHIFT; + page_count = end - start; + error = _xfs_buf_get_pages(bp, page_count); + if (unlikely(error)) + return error; + + offset = bp->b_offset; + bp->b_flags |= _XBF_PAGES; + + for (i = 0; i < bp->b_page_count; i++) { + struct page *page; + uint retries = 0; +retry: + page = alloc_page(gfp_mask); + if (unlikely(page == NULL)) { + if (flags & XBF_READ_AHEAD) { + bp->b_page_count = i; + error = ENOMEM; + goto out_free_pages; + } + + /* + * This could deadlock. + * + * But until all the XFS lowlevel code is revamped to + * handle buffer allocation failures we can't do much. + */ + if (!(++retries % 100)) + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", + __func__, gfp_mask); + + XFS_STATS_INC(xb_page_retries); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + + XFS_STATS_INC(xb_page_found); + + nbytes = min_t(size_t, size, PAGE_SIZE - offset); + size -= nbytes; + bp->b_pages[i] = page; + offset = 0; + } + return 0; + +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); + return error; +} + +/* + * Map buffer into kernel address-space if necessary. + */ +STATIC int +_xfs_buf_map_pages( + xfs_buf_t *bp, + uint flags) +{ + ASSERT(bp->b_flags & _XBF_PAGES); + if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ + bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; + } else if (flags & XBF_UNMAPPED) { + bp->b_addr = NULL; + } else { + int retried = 0; + unsigned noio_flag; + + /* + * vm_map_ram() will allocate auxillary structures (e.g. + * pagetables) with GFP_KERNEL, yet we are likely to be under + * GFP_NOFS context here. Hence we need to tell memory reclaim + * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * memory reclaim re-entering the filesystem here and + * potentially deadlocking. + */ + noio_flag = memalloc_noio_save(); + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + memalloc_noio_restore(noio_flag); + + if (!bp->b_addr) + return -ENOMEM; + bp->b_addr += bp->b_offset; + } + + return 0; +} + +/* + * Finding and Reading Buffers + */ + +/* + * Look up, and creates if absent, a lockable buffer for + * a given range of an inode. The buffer is returned + * locked. No I/O is implied by this call. + */ +xfs_buf_t * +_xfs_buf_find( + struct xfs_buftarg *btp, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + xfs_buf_t *new_bp) +{ + size_t numbytes; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; + xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; + int numblks = 0; + int i; + + for (i = 0; i < nmaps; i++) + numblks += map[i].bm_len; + numbytes = BBTOB(numblks); + + /* Check for IOs smaller than the sector size / not sector aligned */ + ASSERT(!(numbytes < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + WARN_ON(1); + return NULL; + } + + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, blkno)); + + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); + + if (blkno < bp->b_bn) + rbp = &(*rbp)->rb_left; + else if (blkno > bp->b_bn) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_length != numblks) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } + atomic_inc(&bp->b_hold); + goto found; + } + } + + /* No match found */ + if (new_bp) { + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); + } else { + XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + } + return new_bp; + +found: + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) { + xfs_buf_rele(bp); + XFS_STATS_INC(xb_busy_locked); + return NULL; + } + xfs_buf_lock(bp); + XFS_STATS_INC(xb_get_locked_waited); + } + + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. + */ + if (bp->b_flags & XBF_STALE) { + ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); + ASSERT(bp->b_iodone == NULL); + bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_ops = NULL; + } + + trace_xfs_buf_find(bp, flags, _RET_IP_); + XFS_STATS_INC(xb_get_locked); + return bp; +} + +/* + * Assembles a buffer covering the specified range. The code is optimised for + * cache hits, as metadata intensive workloads will see 3 orders of magnitude + * more hits than misses. + */ +struct xfs_buf * +xfs_buf_get_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + struct xfs_buf *new_bp; + int error = 0; + + bp = _xfs_buf_find(target, map, nmaps, flags, NULL); + if (likely(bp)) + goto found; + + new_bp = _xfs_buf_alloc(target, map, nmaps, flags); + if (unlikely(!new_bp)) + return NULL; + + error = xfs_buf_allocate_memory(new_bp, flags); + if (error) { + xfs_buf_free(new_bp); + return NULL; + } + + bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); + if (!bp) { + xfs_buf_free(new_bp); + return NULL; + } + + if (bp != new_bp) + xfs_buf_free(new_bp); + +found: + if (!bp->b_addr) { + error = _xfs_buf_map_pages(bp, flags); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pagesn", __func__); + xfs_buf_relse(bp); + return NULL; + } + } + + XFS_STATS_INC(xb_get); + trace_xfs_buf_get(bp, flags, _RET_IP_); + return bp; +} + +STATIC int +_xfs_buf_read( + xfs_buf_t *bp, + xfs_buf_flags_t flags) +{ + ASSERT(!(flags & XBF_WRITE)); + ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); + + bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); + bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); + + xfs_buf_iorequest(bp); + if (flags & XBF_ASYNC) + return 0; + return xfs_buf_iowait(bp); +} + +xfs_buf_t * +xfs_buf_read_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + flags |= XBF_READ; + + bp = xfs_buf_get_map(target, map, nmaps, flags); + if (bp) { + trace_xfs_buf_read(bp, flags, _RET_IP_); + + if (!XFS_BUF_ISDONE(bp)) { + XFS_STATS_INC(xb_get_read); + bp->b_ops = ops; + _xfs_buf_read(bp, flags); + } else if (flags & XBF_ASYNC) { + /* + * Read ahead call which is already satisfied, + * drop the buffer + */ + xfs_buf_relse(bp); + return NULL; + } else { + /* We do not want read in the flags */ + bp->b_flags &= ~XBF_READ; + } + } + + return bp; +} + +/* + * If we are not low on memory then do the readahead in a deadlock + * safe manner. + */ +void +xfs_buf_readahead_map( + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + const struct xfs_buf_ops *ops) +{ + if (bdi_read_congested(target->bt_bdi)) + return; + + xfs_buf_read_map(target, map, nmaps, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); +} + +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. + */ +struct xfs_buf * +xfs_buf_read_uncached( + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(target, numblks, flags); + if (!bp) + return NULL; + + /* set up the buffer for a read IO */ + ASSERT(bp->b_map_count == 1); + bp->b_bn = daddr; + bp->b_maps[0].bm_bn = daddr; + bp->b_flags |= XBF_READ; + bp->b_ops = ops; + + if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_relse(bp); + return NULL; + } + xfs_buf_iorequest(bp); + xfs_buf_iowait(bp); + return bp; +} + +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. + */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t numblks) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_length = numblks; + bp->b_io_length = numblks; + + ASSERT(bp->b_map_count == 1); + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; + bp->b_maps[0].bm_len = bp->b_length; +} + +static inline struct page * +mem_to_page( + void *addr) +{ + if ((!is_vmalloc_addr(addr))) { + return virt_to_page(addr); + } else { + return vmalloc_to_page(addr); + } +} + +int +xfs_buf_associate_memory( + xfs_buf_t *bp, + void *mem, + size_t len) +{ + int rval; + int i = 0; + unsigned long pageaddr; + unsigned long offset; + size_t buflen; + int page_count; + + pageaddr = (unsigned long)mem & PAGE_MASK; + offset = (unsigned long)mem - pageaddr; + buflen = PAGE_ALIGN(len + offset); + page_count = buflen >> PAGE_SHIFT; + + /* Free any previous set of page pointers */ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_addr = mem; + + rval = _xfs_buf_get_pages(bp, page_count); + if (rval) + return rval; + + bp->b_offset = offset; + + for (i = 0; i < bp->b_page_count; i++) { + bp->b_pages[i] = mem_to_page((void *)pageaddr); + pageaddr += PAGE_SIZE; + } + + bp->b_io_length = BTOBB(len); + bp->b_length = BTOBB(buflen); + + return 0; +} + +xfs_buf_t * +xfs_buf_get_uncached( + struct xfs_buftarg *target, + size_t numblks, + int flags) +{ + unsigned long page_count; + int error, i; + struct xfs_buf *bp; + DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); + + bp = _xfs_buf_alloc(target, &map, 1, 0); + if (unlikely(bp == NULL)) + goto fail; + + page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; + error = _xfs_buf_get_pages(bp, page_count); + if (error) + goto fail_free_buf; + + for (i = 0; i < page_count; i++) { + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); + if (!bp->b_pages[i]) + goto fail_free_mem; + } + bp->b_flags |= _XBF_PAGES; + + error = _xfs_buf_map_pages(bp, 0); + if (unlikely(error)) { + xfs_warn(target->bt_mount, + "%s: failed to map pages", __func__); + goto fail_free_mem; + } + + trace_xfs_buf_get_uncached(bp, _RET_IP_); + return bp; + + fail_free_mem: + while (--i >= 0) + __free_page(bp->b_pages[i]); + _xfs_buf_free_pages(bp); + fail_free_buf: + xfs_buf_free_maps(bp); + kmem_zone_free(xfs_buf_zone, bp); + fail: + return NULL; +} + +/* + * Increment reference count on buffer, to hold the buffer concurrently + * with another thread which may release (free) the buffer asynchronously. + * Must hold the buffer already to call this function. + */ +void +xfs_buf_hold( + xfs_buf_t *bp) +{ + trace_xfs_buf_hold(bp, _RET_IP_); + atomic_inc(&bp->b_hold); +} + +/* + * Releases a hold on the specified buffer. If the + * the hold count is 1, calls xfs_buf_free. + */ +void +xfs_buf_rele( + xfs_buf_t *bp) +{ + struct xfs_perag *pag = bp->b_pag; + + trace_xfs_buf_rele(bp, _RET_IP_); + + if (!pag) { + ASSERT(list_empty(&bp->b_lru)); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); + + ASSERT(atomic_read(&bp->b_hold) > 0); + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { + spin_lock(&bp->b_lock); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new + * reference to the buffer for the LRU and clear the + * (now stale) dispose list state flag + */ + if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { + bp->b_state &= ~XFS_BSTATE_DISPOSE; + atomic_inc(&bp->b_hold); + } + spin_unlock(&bp->b_lock); + spin_unlock(&pag->pag_buf_lock); + } else { + /* + * most of the time buffers will already be removed from + * the LRU, so optimise that case by checking for the + * XFS_BSTATE_DISPOSE flag indicating the last list the + * buffer was on was the disposal list + */ + if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { + list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); + } else { + ASSERT(list_empty(&bp->b_lru)); + } + spin_unlock(&bp->b_lock); + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); + xfs_buf_free(bp); + } + } +} + + +/* + * Lock a buffer object, if it is not already locked. + * + * If we come across a stale, pinned, locked buffer, we know that we are + * being asked to lock a buffer that has been reallocated. Because it is + * pinned, we know that the log has not been pushed to disk and hence it + * will still be locked. Rather than continuing to have trylock attempts + * fail until someone else pushes the log, push it ourselves before + * returning. This means that the xfsaild will not get stuck trying + * to push on stale inode buffers. + */ +int +xfs_buf_trylock( + struct xfs_buf *bp) +{ + int locked; + + locked = down_trylock(&bp->b_sema) == 0; + if (locked) + XB_SET_OWNER(bp); + + trace_xfs_buf_trylock(bp, _RET_IP_); + return locked; +} + +/* + * Lock a buffer object. + * + * If we come across a stale, pinned, locked buffer, we know that we + * are being asked to lock a buffer that has been reallocated. Because + * it is pinned, we know that the log has not been pushed to disk and + * hence it will still be locked. Rather than sleeping until someone + * else pushes the log, push it ourselves before trying to get the lock. + */ +void +xfs_buf_lock( + struct xfs_buf *bp) +{ + trace_xfs_buf_lock(bp, _RET_IP_); + + if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) + xfs_log_force(bp->b_target->bt_mount, 0); + down(&bp->b_sema); + XB_SET_OWNER(bp); + + trace_xfs_buf_lock_done(bp, _RET_IP_); +} + +void +xfs_buf_unlock( + struct xfs_buf *bp) +{ + XB_CLEAR_OWNER(bp); + up(&bp->b_sema); + + trace_xfs_buf_unlock(bp, _RET_IP_); +} + +STATIC void +xfs_buf_wait_unpin( + xfs_buf_t *bp) +{ + DECLARE_WAITQUEUE (wait, current); + + if (atomic_read(&bp->b_pin_count) == 0) + return; + + add_wait_queue(&bp->b_waiters, &wait); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&bp->b_pin_count) == 0) + break; + io_schedule(); + } + remove_wait_queue(&bp->b_waiters, &wait); + set_current_state(TASK_RUNNING); +} + +/* + * Buffer Utility Routines + */ + +STATIC void +xfs_buf_iodone_work( + struct work_struct *work) +{ + struct xfs_buf *bp = + container_of(work, xfs_buf_t, b_iodone_work); + bool read = !!(bp->b_flags & XBF_READ); + + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + + /* only validate buffers that were read without errors */ + if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) + bp->b_ops->verify_read(bp); + + if (bp->b_iodone) + (*(bp->b_iodone))(bp); + else if (bp->b_flags & XBF_ASYNC) + xfs_buf_relse(bp); + else { + ASSERT(read && bp->b_ops); + complete(&bp->b_iowait); + } +} + +void +xfs_buf_ioend( + struct xfs_buf *bp, + int schedule) +{ + bool read = !!(bp->b_flags & XBF_READ); + + trace_xfs_buf_iodone(bp, _RET_IP_); + + if (bp->b_error == 0) + bp->b_flags |= XBF_DONE; + + if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { + if (schedule) { + INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); + } else { + xfs_buf_iodone_work(&bp->b_iodone_work); + } + } else { + bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); + complete(&bp->b_iowait); + } +} + +void +xfs_buf_ioerror( + xfs_buf_t *bp, + int error) +{ + ASSERT(error >= 0 && error <= 0xffff); + bp->b_error = (unsigned short)error; + trace_xfs_buf_ioerror(bp, error, _RET_IP_); +} + +void +xfs_buf_ioerror_alert( + struct xfs_buf *bp, + const char *func) +{ + xfs_alert(bp->b_target->bt_mount, +"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", + (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); +} + +/* + * Called when we want to stop a buffer from getting written or read. + * We attach the EIO error, muck with its flags, and call xfs_buf_ioend + * so that the proper iodone callbacks get called. + */ +STATIC int +xfs_bioerror( + xfs_buf_t *bp) +{ +#ifdef XFSERRORDEBUG + ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); +#endif + + /* + * No need to wait until the buffer is unpinned, we aren't flushing it. + */ + xfs_buf_ioerror(bp, EIO); + + /* + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + + xfs_buf_ioend(bp, 0); + + return EIO; +} + +/* + * Same as xfs_bioerror, except that we are releasing the buffer + * here ourselves, and avoiding the xfs_buf_ioend call. + * This is meant for userdata errors; metadata bufs come with + * iodone functions attached, so that we can track down errors. + */ +int +xfs_bioerror_relse( + struct xfs_buf *bp) +{ + int64_t fl = bp->b_flags; + /* + * No need to wait until the buffer is unpinned. + * We aren't flushing it. + * + * chunkhold expects B_DONE to be set, whether + * we actually finish the I/O or not. We don't want to + * change that interface. + */ + XFS_BUF_UNREAD(bp); + XFS_BUF_DONE(bp); + xfs_buf_stale(bp); + bp->b_iodone = NULL; + if (!(fl & XBF_ASYNC)) { + /* + * Mark b_error and B_ERROR _both_. + * Lot's of chunkcache code assumes that. + * There's no reason to mark error for + * ASYNC buffers. + */ + xfs_buf_ioerror(bp, EIO); + complete(&bp->b_iowait); + } else { + xfs_buf_relse(bp); + } + + return EIO; +} + +STATIC int +xfs_bdstrat_cb( + struct xfs_buf *bp) +{ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + /* + * Metadata write that didn't get logged but + * written delayed anyway. These aren't associated + * with a transaction, and can be ignored. + */ + if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) + return xfs_bioerror_relse(bp); + else + return xfs_bioerror(bp); + } + + xfs_buf_iorequest(bp); + return 0; +} + +int +xfs_bwrite( + struct xfs_buf *bp) +{ + int error; + + ASSERT(xfs_buf_islocked(bp)); + + bp->b_flags |= XBF_WRITE; + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); + + xfs_bdstrat_cb(bp); + + error = xfs_buf_iowait(bp); + if (error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } + return error; +} + +STATIC void +_xfs_buf_ioend( + xfs_buf_t *bp, + int schedule) +{ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp, schedule); +} + +STATIC void +xfs_buf_bio_end_io( + struct bio *bio, + int error) +{ + xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; + + /* + * don't overwrite existing errors - otherwise we can lose errors on + * buffers that require multiple bios to complete. + */ + if (!bp->b_error) + xfs_buf_ioerror(bp, -error); + + if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) + invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); + + _xfs_buf_ioend(bp, 1); + bio_put(bio); +} + +static void +xfs_buf_ioapply_map( + struct xfs_buf *bp, + int map, + int *buf_offset, + int *count, + int rw) +{ + int page_index; + int total_nr_pages = bp->b_page_count; + int nr_pages; + struct bio *bio; + sector_t sector = bp->b_maps[map].bm_bn; + int size; + int offset; + + total_nr_pages = bp->b_page_count; + + /* skip the pages in the buffer before the start offset */ + page_index = 0; + offset = *buf_offset; + while (offset >= PAGE_SIZE) { + page_index++; + offset -= PAGE_SIZE; + } + + /* + * Limit the IO size to the length of the current vector, and update the + * remaining IO count for the next time around. + */ + size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); + *count -= size; + *buf_offset += size; + +next_chunk: + atomic_inc(&bp->b_io_remaining); + nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); + if (nr_pages > total_nr_pages) + nr_pages = total_nr_pages; + + bio = bio_alloc(GFP_NOIO, nr_pages); + bio->bi_bdev = bp->b_target->bt_bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_end_io = xfs_buf_bio_end_io; + bio->bi_private = bp; + + + for (; size && nr_pages; nr_pages--, page_index++) { + int rbytes, nbytes = PAGE_SIZE - offset; + + if (nbytes > size) + nbytes = size; + + rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, + offset); + if (rbytes < nbytes) + break; + + offset = 0; + sector += BTOBB(nbytes); + size -= nbytes; + total_nr_pages--; + } + + if (likely(bio->bi_iter.bi_size)) { + if (xfs_buf_is_vmapped(bp)) { + flush_kernel_vmap_range(bp->b_addr, + xfs_buf_vmap_len(bp)); + } + submit_bio(rw, bio); + if (size) + goto next_chunk; + } else { + /* + * This is guaranteed not to be the last io reference count + * because the caller (xfs_buf_iorequest) holds a count itself. + */ + atomic_dec(&bp->b_io_remaining); + xfs_buf_ioerror(bp, EIO); + bio_put(bio); + } + +} + +STATIC void +_xfs_buf_ioapply( + struct xfs_buf *bp) +{ + struct blk_plug plug; + int rw; + int offset; + int size; + int i; + + /* + * Make sure we capture only current IO errors rather than stale errors + * left over from previous use of the buffer (e.g. failed readahead). + */ + bp->b_error = 0; + + if (bp->b_flags & XBF_WRITE) { + if (bp->b_flags & XBF_SYNCIO) + rw = WRITE_SYNC; + else + rw = WRITE; + if (bp->b_flags & XBF_FUA) + rw |= REQ_FUA; + if (bp->b_flags & XBF_FLUSH) + rw |= REQ_FLUSH; + + /* + * Run the write verifier callback function if it exists. If + * this function fails it will mark the buffer with an error and + * the IO should not be dispatched. + */ + if (bp->b_ops) { + bp->b_ops->verify_write(bp); + if (bp->b_error) { + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_CORRUPT_INCORE); + return; + } + } + } else if (bp->b_flags & XBF_READ_AHEAD) { + rw = READA; + } else { + rw = READ; + } + + /* we only use the buffer cache for meta-data */ + rw |= REQ_META; + + /* + * Walk all the vectors issuing IO on them. Set up the initial offset + * into the buffer and the desired IO size before we start - + * _xfs_buf_ioapply_vec() will modify them appropriately for each + * subsequent call. + */ + offset = bp->b_offset; + size = BBTOB(bp->b_io_length); + blk_start_plug(&plug); + for (i = 0; i < bp->b_map_count; i++) { + xfs_buf_ioapply_map(bp, i, &offset, &size, rw); + if (bp->b_error) + break; + if (size <= 0) + break; /* all done */ + } + blk_finish_plug(&plug); +} + +void +xfs_buf_iorequest( + xfs_buf_t *bp) +{ + trace_xfs_buf_iorequest(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O + * completion callout which happens before we have started + * all the I/O from calling xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + /* + * If _xfs_buf_ioapply failed, we'll get back here with + * only the reference we took above. _xfs_buf_ioend will + * drop it to zero, so we'd better not queue it for later, + * or we'll free it before it's done. + */ + _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); + + xfs_buf_rele(bp); +} + +/* + * Waits for I/O to complete on the buffer supplied. It returns immediately if + * no I/O is pending or there is already a pending error on the buffer, in which + * case nothing will ever complete. It returns the I/O error code, if any, or + * 0 if there was no error. + */ +int +xfs_buf_iowait( + xfs_buf_t *bp) +{ + trace_xfs_buf_iowait(bp, _RET_IP_); + + if (!bp->b_error) + wait_for_completion(&bp->b_iowait); + + trace_xfs_buf_iowait_done(bp, _RET_IP_); + return bp->b_error; +} + +xfs_caddr_t +xfs_buf_offset( + xfs_buf_t *bp, + size_t offset) +{ + struct page *page; + + if (bp->b_addr) + return bp->b_addr + offset; + + offset += bp->b_offset; + page = bp->b_pages[offset >> PAGE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); +} + +/* + * Move data into or out of a buffer. + */ +void +xfs_buf_iomove( + xfs_buf_t *bp, /* buffer to process */ + size_t boff, /* starting buffer offset */ + size_t bsize, /* length to copy */ + void *data, /* data address */ + xfs_buf_rw_t mode) /* read/write/zero flag */ +{ + size_t bend; + + bend = boff + bsize; + while (boff < bend) { + struct page *page; + int page_index, page_offset, csize; + + page_index = (boff + bp->b_offset) >> PAGE_SHIFT; + page_offset = (boff + bp->b_offset) & ~PAGE_MASK; + page = bp->b_pages[page_index]; + csize = min_t(size_t, PAGE_SIZE - page_offset, + BBTOB(bp->b_io_length) - boff); + + ASSERT((csize + page_offset) <= PAGE_SIZE); + + switch (mode) { + case XBRW_ZERO: + memset(page_address(page) + page_offset, 0, csize); + break; + case XBRW_READ: + memcpy(data, page_address(page) + page_offset, csize); + break; + case XBRW_WRITE: + memcpy(page_address(page) + page_offset, data, csize); + } + + boff += csize; + data += csize; + } +} + +/* + * Handling of buffer targets (buftargs). + */ + +/* + * Wait for any bufs with callbacks that have been submitted but have not yet + * returned. These buffers will have an elevated hold count, so wait on those + * while freeing all the buffers only held by the LRU. + */ +static enum lru_status +xfs_buftarg_wait_rele( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) + +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + if (atomic_read(&bp->b_hold) > 1) { + /* need to wait, so skip it this pass */ + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + return LRU_SKIP; + } + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + + /* + * clear the LRU reference count so the buffer doesn't get + * ignored in xfs_buf_rele(). + */ + atomic_set(&bp->b_lru_ref, 0); + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +void +xfs_wait_buftarg( + struct xfs_buftarg *btp) +{ + LIST_HEAD(dispose); + int loop = 0; + + /* loop until there is nothing left on the lru list. */ + while (list_lru_count(&btp->bt_lru)) { + list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, + &dispose, LONG_MAX); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + if (bp->b_flags & XBF_WRITE_FAIL) { + xfs_alert(btp->bt_mount, +"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n" +"Please run xfs_repair to determine the extent of the problem.", + (long long)bp->b_bn); + } + xfs_buf_rele(bp); + } + if (loop++ != 0) + delay(100); + } +} + +static enum lru_status +xfs_buftarg_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); + struct list_head *dispose = arg; + + /* + * we are inverting the lru lock/bp->b_lock here, so use a trylock. + * If we fail to get the lock, just skip it. + */ + if (!spin_trylock(&bp->b_lock)) + return LRU_SKIP; + /* + * Decrement the b_lru_ref count unless the value is already + * zero. If the value is already zero, we need to reclaim the + * buffer, otherwise it gets another trip through the LRU. + */ + if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) { + spin_unlock(&bp->b_lock); + return LRU_ROTATE; + } + + bp->b_state |= XFS_BSTATE_DISPOSE; + list_move(item, dispose); + spin_unlock(&bp->b_lock); + return LRU_REMOVED; +} + +static unsigned long +xfs_buftarg_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + LIST_HEAD(dispose); + unsigned long freed; + unsigned long nr_to_scan = sc->nr_to_scan; + + freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, + &dispose, &nr_to_scan); + + while (!list_empty(&dispose)) { + struct xfs_buf *bp; + bp = list_first_entry(&dispose, struct xfs_buf, b_lru); + list_del_init(&bp->b_lru); + xfs_buf_rele(bp); + } + + return freed; +} + +static unsigned long +xfs_buftarg_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_buftarg *btp = container_of(shrink, + struct xfs_buftarg, bt_shrinker); + return list_lru_count_node(&btp->bt_lru, sc->nid); +} + +void +xfs_free_buftarg( + struct xfs_mount *mp, + struct xfs_buftarg *btp) +{ + unregister_shrinker(&btp->bt_shrinker); + list_lru_destroy(&btp->bt_lru); + + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); + + kmem_free(btp); +} + +int +xfs_setsize_buftarg( + xfs_buftarg_t *btp, + unsigned int sectorsize) +{ + /* Set up metadata sector size info */ + btp->bt_meta_sectorsize = sectorsize; + btp->bt_meta_sectormask = sectorsize - 1; + + if (set_blocksize(btp->bt_bdev, sectorsize)) { + char name[BDEVNAME_SIZE]; + + bdevname(btp->bt_bdev, name); + + xfs_warn(btp->bt_mount, + "Cannot set_blocksize to %u on device %s", + sectorsize, name); + return EINVAL; + } + + /* Set up device logical sector size mask */ + btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + + return 0; +} + +/* + * When allocating the initial buffer target we have not yet + * read in the superblock, so don't know what sized sectors + * are being used at this early stage. Play safe. + */ +STATIC int +xfs_setsize_buftarg_early( + xfs_buftarg_t *btp, + struct block_device *bdev) +{ + return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); +} + +xfs_buftarg_t * +xfs_alloc_buftarg( + struct xfs_mount *mp, + struct block_device *bdev) +{ + xfs_buftarg_t *btp; + + btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); + + btp->bt_mount = mp; + btp->bt_dev = bdev->bd_dev; + btp->bt_bdev = bdev; + btp->bt_bdi = blk_get_backing_dev_info(bdev); + if (!btp->bt_bdi) + goto error; + + if (xfs_setsize_buftarg_early(btp, bdev)) + goto error; + + if (list_lru_init(&btp->bt_lru)) + goto error; + + btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker.seeks = DEFAULT_SEEKS; + btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; + register_shrinker(&btp->bt_shrinker); + return btp; + +error: + kmem_free(btp); + return NULL; +} + +/* + * Add a buffer to the delayed write list. + * + * This queues a buffer for writeout if it hasn't already been. Note that + * neither this routine nor the buffer list submission functions perform + * any internal synchronization. It is expected that the lists are thread-local + * to the callers. + * + * Returns true if we queued up the buffer, or false if it already had + * been on the buffer list. + */ +bool +xfs_buf_delwri_queue( + struct xfs_buf *bp, + struct list_head *list) +{ + ASSERT(xfs_buf_islocked(bp)); + ASSERT(!(bp->b_flags & XBF_READ)); + + /* + * If the buffer is already marked delwri it already is queued up + * by someone else for imediate writeout. Just ignore it in that + * case. + */ + if (bp->b_flags & _XBF_DELWRI_Q) { + trace_xfs_buf_delwri_queued(bp, _RET_IP_); + return false; + } + + trace_xfs_buf_delwri_queue(bp, _RET_IP_); + + /* + * If a buffer gets written out synchronously or marked stale while it + * is on a delwri list we lazily remove it. To do this, the other party + * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. + * It remains referenced and on the list. In a rare corner case it + * might get readded to a delwri list after the synchronous writeout, in + * which case we need just need to re-add the flag here. + */ + bp->b_flags |= _XBF_DELWRI_Q; + if (list_empty(&bp->b_list)) { + atomic_inc(&bp->b_hold); + list_add_tail(&bp->b_list, list); + } + + return true; +} + +/* + * Compare function is more complex than it needs to be because + * the return value is only 32 bits and we are doing comparisons + * on 64 bit values + */ +static int +xfs_buf_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); + struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); + xfs_daddr_t diff; + + diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; + if (diff < 0) + return -1; + if (diff > 0) + return 1; + return 0; +} + +static int +__xfs_buf_delwri_submit( + struct list_head *buffer_list, + struct list_head *io_list, + bool wait) +{ + struct blk_plug plug; + struct xfs_buf *bp, *n; + int pinned = 0; + + list_for_each_entry_safe(bp, n, buffer_list, b_list) { + if (!wait) { + if (xfs_buf_ispinned(bp)) { + pinned++; + continue; + } + if (!xfs_buf_trylock(bp)) + continue; + } else { + xfs_buf_lock(bp); + } + + /* + * Someone else might have written the buffer synchronously or + * marked it stale in the meantime. In that case only the + * _XBF_DELWRI_Q flag got cleared, and we have to drop the + * reference and remove it from the list here. + */ + if (!(bp->b_flags & _XBF_DELWRI_Q)) { + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + continue; + } + + list_move_tail(&bp->b_list, io_list); + trace_xfs_buf_delwri_split(bp, _RET_IP_); + } + + list_sort(NULL, io_list, xfs_buf_cmp); + + blk_start_plug(&plug); + list_for_each_entry_safe(bp, n, io_list, b_list) { + bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); + bp->b_flags |= XBF_WRITE; + + if (!wait) { + bp->b_flags |= XBF_ASYNC; + list_del_init(&bp->b_list); + } + xfs_bdstrat_cb(bp); + } + blk_finish_plug(&plug); + + return pinned; +} + +/* + * Write out a buffer list asynchronously. + * + * This will take the @buffer_list, write all non-locked and non-pinned buffers + * out and not wait for I/O completion on any of the buffers. This interface + * is only safely useable for callers that can track I/O completion by higher + * level means, e.g. AIL pushing as the @buffer_list is consumed in this + * function. + */ +int +xfs_buf_delwri_submit_nowait( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + return __xfs_buf_delwri_submit(buffer_list, &io_list, false); +} + +/* + * Write out a buffer list synchronously. + * + * This will take the @buffer_list, write all buffers out and wait for I/O + * completion on all of the buffers. @buffer_list is consumed by the function, + * so callers must have some other way of tracking buffers if they require such + * functionality. + */ +int +xfs_buf_delwri_submit( + struct list_head *buffer_list) +{ + LIST_HEAD (io_list); + int error = 0, error2; + struct xfs_buf *bp; + + __xfs_buf_delwri_submit(buffer_list, &io_list, true); + + /* Wait for IO to complete. */ + while (!list_empty(&io_list)) { + bp = list_first_entry(&io_list, struct xfs_buf, b_list); + + list_del_init(&bp->b_list); + error2 = xfs_buf_iowait(bp); + xfs_buf_relse(bp); + if (!error) + error = error2; + } + + return error; +} + +int __init +xfs_buf_init(void) +{ + xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", + KM_ZONE_HWALIGN, NULL); + if (!xfs_buf_zone) + goto out; + + xfslogd_workqueue = alloc_workqueue("xfslogd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); + if (!xfslogd_workqueue) + goto out_free_buf_zone; + + return 0; + + out_free_buf_zone: + kmem_zone_destroy(xfs_buf_zone); + out: + return -ENOMEM; +} + +void +xfs_buf_terminate(void) +{ + destroy_workqueue(xfslogd_workqueue); + kmem_zone_destroy(xfs_buf_zone); +} diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h new file mode 100644 index 00000000000..3a7a5523d3d --- /dev/null +++ b/fs/xfs/xfs_buf.h @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_BUF_H__ +#define __XFS_BUF_H__ + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/uio.h> +#include <linux/list_lru.h> + +/* + * Base types + */ + +#define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) + +typedef enum { + XBRW_READ = 1, /* transfer into target memory */ + XBRW_WRITE = 2, /* transfer from target memory */ + XBRW_ZERO = 3, /* Zero target memory */ +} xfs_buf_rw_t; + +#define XBF_READ (1 << 0) /* buffer intended for reading from device */ +#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ +#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ +#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ +#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ +#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ +#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */ + +/* I/O hints for the BIO layer */ +#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ +#define XBF_FUA (1 << 11)/* force cache write through mode */ +#define XBF_FLUSH (1 << 12)/* flush the disk cache before a write */ + +/* flags used only as arguments to access routines */ +#define XBF_TRYLOCK (1 << 16)/* lock requested, but do not wait */ +#define XBF_UNMAPPED (1 << 17)/* do not map the buffer */ + +/* flags used only internally */ +#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ +#define _XBF_KMEM (1 << 21)/* backed by heap memory */ +#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ +#define _XBF_COMPOUND (1 << 23)/* compound buffer */ + +typedef unsigned int xfs_buf_flags_t; + +#define XFS_BUF_FLAGS \ + { XBF_READ, "READ" }, \ + { XBF_WRITE, "WRITE" }, \ + { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_ASYNC, "ASYNC" }, \ + { XBF_DONE, "DONE" }, \ + { XBF_STALE, "STALE" }, \ + { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ + { XBF_SYNCIO, "SYNCIO" }, \ + { XBF_FUA, "FUA" }, \ + { XBF_FLUSH, "FLUSH" }, \ + { XBF_TRYLOCK, "TRYLOCK" }, /* should never be set */\ + { XBF_UNMAPPED, "UNMAPPED" }, /* ditto */\ + { _XBF_PAGES, "PAGES" }, \ + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ + { _XBF_COMPOUND, "COMPOUND" } + + +/* + * Internal state flags. + */ +#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ + +/* + * The xfs_buftarg contains 2 notions of "sector size" - + * + * 1) The metadata sector size, which is the minimum unit and + * alignment of IO which will be performed by metadata operations. + * 2) The device logical sector size + * + * The first is specified at mkfs time, and is stored on-disk in the + * superblock's sb_sectsize. + * + * The latter is derived from the underlying device, and controls direct IO + * alignment constraints. + */ +typedef struct xfs_buftarg { + dev_t bt_dev; + struct block_device *bt_bdev; + struct backing_dev_info *bt_bdi; + struct xfs_mount *bt_mount; + unsigned int bt_meta_sectorsize; + size_t bt_meta_sectormask; + size_t bt_logical_sectorsize; + size_t bt_logical_sectormask; + + /* LRU control structures */ + struct shrinker bt_shrinker; + struct list_lru bt_lru; +} xfs_buftarg_t; + +struct xfs_buf; +typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); + + +#define XB_PAGES 2 + +struct xfs_buf_map { + xfs_daddr_t bm_bn; /* block number for I/O */ + int bm_len; /* size of I/O */ +}; + +#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ + struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; + +struct xfs_buf_ops { + void (*verify_read)(struct xfs_buf *); + void (*verify_write)(struct xfs_buf *); +}; + +typedef struct xfs_buf { + /* + * first cacheline holds all the fields needed for an uncontended cache + * hit to be fully processed. The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ + struct rb_node b_rbnode; /* rbtree node */ + xfs_daddr_t b_bn; /* block number of buffer */ + int b_length; /* size of buffer in BBs */ + atomic_t b_hold; /* reference count */ + atomic_t b_lru_ref; /* lru reclaim ref count */ + xfs_buf_flags_t b_flags; /* status flags */ + struct semaphore b_sema; /* semaphore for lockables */ + + /* + * concurrent access to b_lru and b_lru_flags are protected by + * bt_lru_lock and not by b_sema + */ + struct list_head b_lru; /* lru list */ + spinlock_t b_lock; /* internal state lock */ + unsigned int b_state; /* internal state flags */ + wait_queue_head_t b_waiters; /* unpin waiters */ + struct list_head b_list; + struct xfs_perag *b_pag; /* contains rbtree root */ + xfs_buftarg_t *b_target; /* buffer target (device) */ + void *b_addr; /* virtual address of buffer */ + struct work_struct b_iodone_work; + xfs_buf_iodone_t b_iodone; /* I/O completion function */ + struct completion b_iowait; /* queue for I/O waiters */ + void *b_fspriv; + struct xfs_trans *b_transp; + struct page **b_pages; /* array of page pointers */ + struct page *b_page_array[XB_PAGES]; /* inline pages */ + struct xfs_buf_map *b_maps; /* compound buffer map */ + struct xfs_buf_map __b_map; /* inline compound buffer map */ + int b_map_count; + int b_io_length; /* IO size in BBs */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + unsigned short b_error; /* error code on I/O */ + const struct xfs_buf_ops *b_ops; + +#ifdef XFS_BUF_LOCK_TRACKING + int b_last_holder; +#endif +} xfs_buf_t; + +/* Finding and Reading Buffers */ +struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, struct xfs_buf *new_bp); + +static inline struct xfs_buf * +xfs_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_find(target, &map, 1, flags, NULL); +} + +struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); + +static inline struct xfs_buf * +xfs_buf_alloc( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return _xfs_buf_alloc(target, &map, 1, flags); +} + +struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags); +struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops); +void xfs_buf_readahead_map(struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + const struct xfs_buf_ops *ops); + +static inline struct xfs_buf * +xfs_buf_get( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_get_map(target, &map, 1, flags); +} + +static inline struct xfs_buf * +xfs_buf_read( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_read_map(target, &map, 1, flags, ops); +} + +static inline void +xfs_buf_readahead( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_buf_readahead_map(target, &map, 1, ops); +} + +struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); +void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); +int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); + +struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, + int flags); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t numblks, int flags, + const struct xfs_buf_ops *ops); +void xfs_buf_hold(struct xfs_buf *bp); + +/* Releasing Buffers */ +extern void xfs_buf_free(xfs_buf_t *); +extern void xfs_buf_rele(xfs_buf_t *); + +/* Locking and Unlocking Buffers */ +extern int xfs_buf_trylock(xfs_buf_t *); +extern void xfs_buf_lock(xfs_buf_t *); +extern void xfs_buf_unlock(xfs_buf_t *); +#define xfs_buf_islocked(bp) \ + ((bp)->b_sema.count <= 0) + +/* Buffer Read and Write Routines */ +extern int xfs_bwrite(struct xfs_buf *bp); +extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioerror(xfs_buf_t *, int); +extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); +extern void xfs_buf_iorequest(xfs_buf_t *); +extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, + xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) + +extern int xfs_bioerror_relse(struct xfs_buf *); + +/* Buffer Utility Routines */ +extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); + +/* Delayed Write Buffer Routines */ +extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); +extern int xfs_buf_delwri_submit(struct list_head *); +extern int xfs_buf_delwri_submit_nowait(struct list_head *); + +/* Buffer Daemon Setup Routines */ +extern int xfs_buf_init(void); +extern void xfs_buf_terminate(void); + +#define XFS_BUF_ZEROFLAGS(bp) \ + ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \ + XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \ + XBF_WRITE_FAIL)) + +void xfs_buf_stale(struct xfs_buf *bp); +#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) +#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) + +#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) +#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) +#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) + +#define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) +#define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) +#define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) + +#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) +#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) +#define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) + +#define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) +#define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) +#define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) + +/* + * These macros use the IO block map rather than b_bn. b_bn is now really + * just for the buffer cache index for cached buffers. As IO does not use b_bn + * anymore, uncached buffers do not use b_bn at all and hence must modify the IO + * map directly. Uncached buffers are not allowed to be discontiguous, so this + * is safe to do. + * + * In future, uncached buffers will pass the block number directly to the io + * request function and hence these macros will go away at that point. + */ +#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) +#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) + +static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) +{ + atomic_set(&bp->b_lru_ref, lru_ref); +} + +static inline int xfs_buf_ispinned(struct xfs_buf *bp) +{ + return atomic_read(&bp->b_pin_count); +} + +static inline void xfs_buf_relse(xfs_buf_t *bp) +{ + xfs_buf_unlock(bp); + xfs_buf_rele(bp); +} + +static inline int +xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +static inline void +xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) +{ + xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), + cksum_offset); +} + +/* + * Handling of buftargs. + */ +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); +extern void xfs_wait_buftarg(xfs_buftarg_t *); +extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); + +#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) +#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) + +#endif /* __XFS_BUF_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index a4aa53974f7..4654338b03f 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -17,125 +17,36 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_log.h" kmem_zone_t *xfs_buf_item_zone; -#ifdef XFS_TRANS_DEBUG -/* - * This function uses an alternate strategy for tracking the bytes - * that the user requests to be logged. This can then be used - * in conjunction with the bli_orig array in the buf log item to - * catch bugs in our callers' code. - * - * We also double check the bits set in xfs_buf_item_log using a - * simple algorithm to check that every byte is accounted for. - */ -STATIC void -xfs_buf_item_log_debug( - xfs_buf_log_item_t *bip, - uint first, - uint last) +static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) { - uint x; - uint byte; - uint nbytes; - uint chunk_num; - uint word_num; - uint bit_num; - uint bit_set; - uint *wordp; - - ASSERT(bip->bli_logged != NULL); - byte = first; - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); - for (x = 0; x < nbytes; x++) { - chunk_num = byte >> XFS_BLI_SHIFT; - word_num = chunk_num >> BIT_TO_WORD_SHIFT; - bit_num = chunk_num & (NBWORD - 1); - wordp = &(bip->bli_format.blf_data_map[word_num]); - bit_set = *wordp & (1 << bit_num); - ASSERT(bit_set); - byte++; - } + return container_of(lip, struct xfs_buf_log_item, bli_item); } -/* - * This function is called when we flush something into a buffer without - * logging it. This happens for things like inodes which are logged - * separately from the buffer. - */ -void -xfs_buf_item_flush_log_debug( - xfs_buf_t *bp, - uint first, - uint last) -{ - xfs_buf_log_item_t *bip; - uint nbytes; - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { - return; - } - - ASSERT(bip->bli_logged != NULL); - nbytes = last - first + 1; - bfset(bip->bli_logged, first, nbytes); -} +STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); -/* - * This function is called to verify that our callers have logged - * all the bytes that they changed. - * - * It does this by comparing the original copy of the buffer stored in - * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which mismatch are set in the bli_logged - * array of the buf log item. - */ -STATIC void -xfs_buf_item_log_check( - xfs_buf_log_item_t *bip) +static inline int +xfs_buf_log_format_size( + struct xfs_buf_log_format *blfp) { - char *orig; - char *buffer; - int x; - xfs_buf_t *bp; - - ASSERT(bip->bli_orig != NULL); - ASSERT(bip->bli_logged != NULL); - - bp = bip->bli_buf; - ASSERT(XFS_BUF_COUNT(bp) > 0); - ASSERT(XFS_BUF_PTR(bp) != NULL); - orig = bip->bli_orig; - buffer = XFS_BUF_PTR(bp); - for (x = 0; x < XFS_BUF_COUNT(bp); x++) { - if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) - cmn_err(CE_PANIC, - "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", - bip, bp, orig, x); - } + return offsetof(struct xfs_buf_log_format, blf_data_map) + + (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -#else -#define xfs_buf_item_log_debug(x,y,z) -#define xfs_buf_item_log_check(x) -#endif - -STATIC void xfs_buf_error_relse(xfs_buf_t *bp); -STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); /* * This returns the number of log iovecs needed to log the @@ -147,34 +58,28 @@ STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); * * If the XFS_BLI_STALE flag has been set, then log nothing. */ -STATIC uint -xfs_buf_item_size( - xfs_buf_log_item_t *bip) +STATIC void +xfs_buf_item_size_segment( + struct xfs_buf_log_item *bip, + struct xfs_buf_log_format *blfp, + int *nvecs, + int *nbytes) { - uint nvecs; - int next_bit; - int last_bit; - xfs_buf_t *bp; + struct xfs_buf *bp = bip->bli_buf; + int next_bit; + int last_bit; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - if (bip->bli_flags & XFS_BLI_STALE) { - /* - * The buffer is stale, so all we need to log - * is the buf log format structure with the - * cancel flag in it. - */ - xfs_buf_item_trace("SIZE STALE", bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - return 1; - } + last_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (last_bit == -1) + return; + + /* + * initial count for a dirty buffer is 2 vectors - the format structure + * and the first dirty region. + */ + *nvecs += 2; + *nbytes += xfs_buf_log_format_size(blfp) + XFS_BLF_CHUNK; - bp = bip->bli_buf; - ASSERT(bip->bli_flags & XFS_BLI_LOGGED); - nvecs = 1; - last_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(last_bit != -1); - nvecs++; while (last_bit != -1) { /* * This takes the bit number to start looking from and @@ -182,76 +87,164 @@ xfs_buf_item_size( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + last_bit + 1); /* * If we run out of bits, leave the loop, * else if we find a new set of bits bump the number of vecs, * else keep scanning the current set of bits. */ if (next_bit == -1) { - last_bit = -1; + break; } else if (next_bit != last_bit + 1) { last_bit = next_bit; - nvecs++; - } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != - (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + - XFS_BLI_CHUNK)) { + (*nvecs)++; + } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) != + (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) + + XFS_BLF_CHUNK)) { last_bit = next_bit; - nvecs++; + (*nvecs)++; } else { last_bit++; } + *nbytes += XFS_BLF_CHUNK; } - - xfs_buf_item_trace("SIZE NORM", bip); - return nvecs; } /* - * This is called to fill in the vector of log iovecs for the - * given log buf item. It fills the first entry with a buf log - * format structure, and the rest point to contiguous chunks - * within the buffer. + * This returns the number of log iovecs needed to log the given buf log item. + * + * It calculates this as 1 iovec for the buf log format structure and 1 for each + * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged + * in a single iovec. + * + * Discontiguous buffers need a format structure per region that that is being + * logged. This makes the changes in the buffer appear to log recovery as though + * they came from separate buffers, just like would occur if multiple buffers + * were used instead of a single discontiguous buffer. This enables + * discontiguous buffers to be in-memory constructs, completely transparent to + * what ends up on disk. + * + * If the XFS_BLI_STALE flag has been set, then log nothing but the buf log + * format structures. */ STATIC void -xfs_buf_item_format( - xfs_buf_log_item_t *bip, - xfs_log_iovec_t *log_vector) +xfs_buf_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (bip->bli_flags & XFS_BLI_STALE) { + /* + * The buffer is stale, so all we need to log + * is the buf log format structure with the + * cancel flag in it. + */ + trace_xfs_buf_item_size_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + *nvecs += bip->bli_format_count; + for (i = 0; i < bip->bli_format_count; i++) { + *nbytes += xfs_buf_log_format_size(&bip->bli_formats[i]); + } + return; + } + + ASSERT(bip->bli_flags & XFS_BLI_LOGGED); + + if (bip->bli_flags & XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. + * It is not being included in the transaction + * commit, so no vectors are used at all. + */ + trace_xfs_buf_item_size_ordered(bip); + *nvecs = XFS_LOG_VEC_ORDERED; + return; + } + + /* + * the vector count is based on the number of buffer vectors we have + * dirty bits in. This will only be greater than one when we have a + * compound buffer with more than one segment dirty. Hence for compound + * buffers we need to track which segment the dirty bits correspond to, + * and when we move from one segment to the next increment the vector + * count for the extra buf log format structure that will need to be + * written. + */ + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_size_segment(bip, &bip->bli_formats[i], + nvecs, nbytes); + } + trace_xfs_buf_item_size(bip); +} + +static inline void +xfs_buf_item_copy_iovec( + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + struct xfs_buf *bp, + uint offset, + int first_bit, + uint nbits) +{ + offset += first_bit * XFS_BLF_CHUNK; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK, + xfs_buf_offset(bp, offset), + nbits * XFS_BLF_CHUNK); +} + +static inline bool +xfs_buf_item_straddle( + struct xfs_buf *bp, + uint offset, + int next_bit, + int last_bit) { + return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) != + (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) + + XFS_BLF_CHUNK); +} + +static void +xfs_buf_item_format_segment( + struct xfs_buf_log_item *bip, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp, + uint offset, + struct xfs_buf_log_format *blfp) +{ + struct xfs_buf *bp = bip->bli_buf; uint base_size; - uint nvecs; - xfs_log_iovec_t *vecp; - xfs_buf_t *bp; int first_bit; int last_bit; int next_bit; uint nbits; - uint buffer_offset; - ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || - (bip->bli_flags & XFS_BLI_STALE)); - bp = bip->bli_buf; - ASSERT(XFS_BUF_BP_ISMAPPED(bp)); - vecp = log_vector; + /* copy the flags across from the base format item */ + blfp->blf_flags = bip->__bli_format.blf_flags; /* - * The size of the base structure is the size of the - * declared structure plus the space for the extra words - * of the bitmap. We subtract one from the map size, because - * the first element of the bitmap is accounted for in the - * size of the base structure. + * Base size is the actual size of the ondisk structure - it reflects + * the actual size of the dirty bitmap rather than the size of the in + * memory structure. */ - base_size = - (uint)(sizeof(xfs_buf_log_format_t) + - ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); - vecp->i_addr = (xfs_caddr_t)&bip->bli_format; - vecp->i_len = base_size; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); - vecp++; - nvecs = 1; + base_size = xfs_buf_log_format_size(blfp); + + first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); + if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { + /* + * If the map is not be dirty in the transaction, mark + * the size as zero and do not advance the vector pointer. + */ + return; + } + + blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size); + blfp->blf_size = 1; if (bip->bli_flags & XFS_BLI_STALE) { /* @@ -259,18 +252,15 @@ xfs_buf_item_format( * is the buf log format structure with the * cancel flag in it. */ - xfs_buf_item_trace("FORMAT STALE", bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - bip->bli_format.blf_size = nvecs; + trace_xfs_buf_item_format_stale(bip); + ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); return; } + /* * Fill in an iovec for each set of contiguous chunks. */ - first_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0); - ASSERT(first_bit != -1); last_bit = first_bit; nbits = 1; for (;;) { @@ -280,48 +270,25 @@ xfs_buf_item_format( * if there are no more bits set or the start bit is * beyond the end of the bitmap. */ - next_bit = xfs_next_bit(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, - (uint)last_bit + 1); + next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, + (uint)last_bit + 1); /* - * If we run out of bits fill in the last iovec and get - * out of the loop. - * Else if we start a new set of bits then fill in the - * iovec for the series we were looking at and start - * counting the bits in the new one. - * Else we're still in the same set of bits so just - * keep counting and scanning. + * If we run out of bits fill in the last iovec and get out of + * the loop. Else if we start a new set of bits then fill in + * the iovec for the series we were looking at and start + * counting the bits in the new one. Else we're still in the + * same set of bits so just keep counting and scanning. */ if (next_bit == -1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); - nvecs++; + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; break; - } else if (next_bit != last_bit + 1) { - buffer_offset = first_bit * XFS_BLI_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); - nvecs++; - vecp++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != - (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + - XFS_BLI_CHUNK)) { - buffer_offset = first_bit * XFS_BLI_CHUNK; - vecp->i_addr = xfs_buf_offset(bp, buffer_offset); - vecp->i_len = nbits * XFS_BLI_CHUNK; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); -/* You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ - vecp++; + } else if (next_bit != last_bit + 1 || + xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) { + xfs_buf_item_copy_iovec(lv, vecp, bp, offset, + first_bit, nbits); + blfp->blf_size++; first_bit = next_bit; last_bit = next_bit; nbits = 1; @@ -330,270 +297,363 @@ xfs_buf_item_format( nbits++; } } - bip->bli_format.blf_size = nvecs; +} + +/* + * This is called to fill in the vector of log iovecs for the + * given log buf item. It fills the first entry with a buf log + * format structure, and the rest point to contiguous chunks + * within the buffer. + */ +STATIC void +xfs_buf_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + struct xfs_log_iovec *vecp = NULL; + uint offset = 0; + int i; + + ASSERT(atomic_read(&bip->bli_refcount) > 0); + ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_STALE)); + + /* + * If it is an inode buffer, transfer the in-memory state to the + * format flags and clear the in-memory state. + * + * For buffer based inode allocation, we do not transfer + * this state if the inode buffer allocation has not yet been committed + * to the log as setting the XFS_BLI_INODE_BUF flag will prevent + * correct replay of the inode allocation. + * + * For icreate item based inode allocation, the buffers aren't written + * to the journal during allocation, and hence we should always tag the + * buffer as an inode buffer so that the correct unlinked list replay + * occurs during recovery. + */ + if (bip->bli_flags & XFS_BLI_INODE_BUF) { + if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) || + !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && + xfs_log_item_in_current_chkpt(lip))) + bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; + bip->bli_flags &= ~XFS_BLI_INODE_BUF; + } + + if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == + XFS_BLI_ORDERED) { + /* + * The buffer has been logged just to order it. It is not being + * included in the transaction commit, so don't format it. + */ + trace_xfs_buf_item_format_ordered(bip); + return; + } + + for (i = 0; i < bip->bli_format_count; i++) { + xfs_buf_item_format_segment(bip, lv, &vecp, offset, + &bip->bli_formats[i]); + offset += bp->b_maps[i].bm_len; + } /* * Check to make sure everything is consistent. */ - xfs_buf_item_trace("FORMAT NORM", bip); - xfs_buf_item_log_check(bip); + trace_xfs_buf_item_format(bip); } /* - * This is called to pin the buffer associated with the buf log - * item in memory so it cannot be written out. Simply call bpin() - * on the buffer to do this. + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. + * + * We also always take a reference to the buffer log item here so that the bli + * is held while the item is pinned in memory. This means that we can + * unconditionally drop the reference count a transaction holds when the + * transaction is completed. */ STATIC void xfs_buf_item_pin( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); - bp = bip->bli_buf; - ASSERT(XFS_BUF_ISBUSY(bp)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || + (bip->bli_flags & XFS_BLI_ORDERED) || (bip->bli_flags & XFS_BLI_STALE)); - xfs_buf_item_trace("PIN", bip); - xfs_buftrace("XFS_PIN", bp); - xfs_bpin(bp); -} + trace_xfs_buf_item_pin(bip); + + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); +} /* * This is called to unpin the buffer associated with the buf log * item which was previously pinned with a call to xfs_buf_item_pin(). - * Just call bunpin() on the buffer to do this. * * Also drop the reference to the buf item for the current transaction. * If the XFS_BLI_STALE flag is set and we are the last reference, * then free up the buf log item and unlock the buffer. + * + * If the remove flag is set we are called from uncommit in the + * forced-shutdown path. If that is true and the reference count on + * the log item is going to drop to zero we need to free the item's + * descriptor in the transaction. */ STATIC void xfs_buf_item_unpin( - xfs_buf_log_item_t *bip, - int stale) + struct xfs_log_item *lip, + int remove) { - xfs_mount_t *mp; - xfs_buf_t *bp; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + xfs_buf_t *bp = bip->bli_buf; + struct xfs_ail *ailp = lip->li_ailp; + int stale = bip->bli_flags & XFS_BLI_STALE; int freed; - SPLDECL(s); - bp = bip->bli_buf; - ASSERT(bp != NULL); - ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); + ASSERT(bp->b_fspriv == bip); ASSERT(atomic_read(&bip->bli_refcount) > 0); - xfs_buf_item_trace("UNPIN", bip); - xfs_buftrace("XFS_UNPIN", bp); + + trace_xfs_buf_item_unpin(bip); freed = atomic_dec_and_test(&bip->bli_refcount); - mp = bip->bli_item.li_mountp; - xfs_bunpin(bp); + + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + if (freed && stale) { ASSERT(bip->bli_flags & XFS_BLI_STALE); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); + ASSERT(xfs_buf_islocked(bp)); ASSERT(XFS_BUF_ISSTALE(bp)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - xfs_buf_item_trace("UNPIN STALE", bip); - xfs_buftrace("XFS_UNPIN STALE", bp); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + + trace_xfs_buf_item_unpin_stale(bip); + + if (remove) { + /* + * If we are in a transaction context, we have to + * remove the log item from the transaction as we are + * about to release our reference to the buffer. If we + * don't, the unlock that occurs later in + * xfs_trans_uncommit() will try to reference the + * buffer which we no longer have a hold on. + */ + if (lip->li_desc) + xfs_trans_del_item(lip); + + /* + * Since the transaction no longer refers to the buffer, + * the buffer should no longer refer to the transaction. + */ + bp->b_transp = NULL; + } + /* * If we get called here because of an IO error, we may - * or may not have the item on the AIL. xfs_trans_delete_ail() + * or may not have the item on the AIL. xfs_trans_ail_delete() * will take care of that situation. - * xfs_trans_delete_ail() drops the AIL lock. + * xfs_trans_ail_delete() drops the AIL lock. */ if (bip->bli_flags & XFS_BLI_STALE_INODE) { - xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); + xfs_buf_do_callbacks(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; } else { - AIL_LOCK(mp,s); - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + ASSERT(bp->b_fspriv == NULL); } xfs_buf_relse(bp); - } -} - -/* - * this is called from uncommit in the forced-shutdown path. - * we need to check to see if the reference count on the log item - * is going to drop to zero. If so, unpin will free the log item - * so we need to free the item's descriptor (that points to the item) - * in the transaction. - */ -STATIC void -xfs_buf_item_unpin_remove( - xfs_buf_log_item_t *bip, - xfs_trans_t *tp) -{ - xfs_buf_t *bp; - xfs_log_item_desc_t *lidp; - int stale = 0; - - bp = bip->bli_buf; - /* - * will xfs_buf_item_unpin() call xfs_buf_item_relse()? - */ - if ((atomic_read(&bip->bli_refcount) == 1) && - (bip->bli_flags & XFS_BLI_STALE)) { - ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); - xfs_buf_item_trace("UNPIN REMOVE", bip); - xfs_buftrace("XFS_UNPIN_REMOVE", bp); + } else if (freed && remove) { /* - * yes -- clear the xaction descriptor in-use flag - * and free the chunk if required. We can safely - * do some work here and then call buf_item_unpin - * to do the rest because if the if is true, then - * we are holding the buffer locked so no one else - * will be able to bump up the refcount. + * There are currently two references to the buffer - the active + * LRU reference and the buf log item. What we are about to do + * here - simulate a failed IO completion - requires 3 + * references. + * + * The LRU reference is removed by the xfs_buf_stale() call. The + * buf item reference is removed by the xfs_buf_iodone() + * callback that is run by xfs_buf_do_callbacks() during ioend + * processing (via the bp->b_iodone callback), and then finally + * the ioend processing will drop the IO reference if the buffer + * is marked XBF_ASYNC. + * + * Hence we need to take an additional reference here so that IO + * completion processing doesn't free the buffer prematurely. */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); - stale = lidp->lid_flags & XFS_LID_BUF_STALE; - xfs_trans_free_item(tp, lidp); - /* - * Since the transaction no longer refers to the buffer, - * the buffer should no longer refer to the transaction. - */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); + xfs_buf_lock(bp); + xfs_buf_hold(bp); + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioerror(bp, EIO); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); } - - xfs_buf_item_unpin(bip, stale); - - return; } /* - * This is called to attempt to lock the buffer associated with this - * buf log item. Don't sleep on the buffer lock. If we can't get - * the lock right away, return 0. If we can get the lock, pull the - * buffer from the free list, mark it busy, and return 1. + * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30 + * seconds so as to not spam logs too much on repeated detection of the same + * buffer being bad.. */ + +DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); + STATIC uint -xfs_buf_item_trylock( - xfs_buf_log_item_t *bip) +xfs_buf_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - xfs_buf_t *bp; - - bp = bip->bli_buf; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + uint rval = XFS_ITEM_SUCCESS; - if (XFS_BUF_ISPINNED(bp)) { + if (xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED; - } - - if (!XFS_BUF_CPSEMA(bp)) { + if (!xfs_buf_trylock(bp)) { + /* + * If we have just raced with a buffer being pinned and it has + * been marked stale, we could end up stalling until someone else + * issues a log force to unpin the stale buffer. Check for the + * race condition here so xfsaild recognizes the buffer is pinned + * and queues a log force to move it along. + */ + if (xfs_buf_ispinned(bp)) + return XFS_ITEM_PINNED; return XFS_ITEM_LOCKED; } - /* - * Remove the buffer from the free list. Only do this - * if it's on the free list. Private buffers like the - * superblock buffer are not. - */ - XFS_BUF_HOLD(bp); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - xfs_buf_item_trace("TRYLOCK SUCCESS", bip); - return XFS_ITEM_SUCCESS; + + trace_xfs_buf_item_push(bip); + + /* has a previous flush failed due to IO errors? */ + if ((bp->b_flags & XBF_WRITE_FAIL) && + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + xfs_warn(bp->b_target->bt_mount, +"Detected failing async write on buffer block 0x%llx. Retrying async write.\n", + (long long)bp->b_bn); + } + + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_unlock(bp); + return rval; } /* - * Release the buffer associated with the buf log item. - * If there is no dirty logged data associated with the - * buffer recorded in the buf log item, then free the - * buf log item and remove the reference to it in the - * buffer. + * Release the buffer associated with the buf log item. If there is no dirty + * logged data associated with the buffer recorded in the buf log item, then + * free the buf log item and remove the reference to it in the buffer. * - * This call ignores the recursion count. It is only called - * when the buffer should REALLY be unlocked, regardless - * of the recursion count. + * This call ignores the recursion count. It is only called when the buffer + * should REALLY be unlocked, regardless of the recursion count. * - * If the XFS_BLI_HOLD flag is set in the buf log item, then - * free the log item if necessary but do not unlock the buffer. - * This is for support of xfs_trans_bhold(). Make sure the - * XFS_BLI_HOLD field is cleared if we don't free the item. + * We unconditionally drop the transaction's reference to the log item. If the + * item was logged, then another reference was taken when it was pinned, so we + * can safely drop the transaction reference now. This also allows us to avoid + * potential races with the unpin code freeing the bli by not referencing the + * bli after we've dropped the reference count. + * + * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item + * if necessary but do not unlock the buffer. This is for support of + * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't + * free the item. */ STATIC void xfs_buf_item_unlock( - xfs_buf_log_item_t *bip) + struct xfs_log_item *lip) { - int aborted; - xfs_buf_t *bp; - uint hold; + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + struct xfs_buf *bp = bip->bli_buf; + bool clean; + bool aborted; + int flags; - bp = bip->bli_buf; - xfs_buftrace("XFS_UNLOCK", bp); + /* Clear the buffer's association with this transaction. */ + bp->b_transp = NULL; /* - * Clear the buffer's association with this transaction. + * If this is a transaction abort, don't return early. Instead, allow + * the brelse to happen. Normally it would be done for stale + * (cancelled) buffers at unpin time, but we'll never go through the + * pin/unpin cycle if we abort inside commit. */ - XFS_BUF_SET_FSPRIVATE2(bp, NULL); - + aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; /* - * If this is a transaction abort, don't return early. - * Instead, allow the brelse to happen. - * Normally it would be done for stale (cancelled) buffers - * at unpin time, but we'll never go through the pin/unpin - * cycle if we abort inside commit. + * Before possibly freeing the buf item, copy the per-transaction state + * so we can reference it safely later after clearing it from the + * buffer log item. */ - aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; + flags = bip->bli_flags; + bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* - * If the buf item is marked stale, then don't do anything. - * We'll unlock the buffer and free the buf item when the - * buffer is unpinned for the last time. + * If the buf item is marked stale, then don't do anything. We'll + * unlock the buffer and free the buf item when the buffer is unpinned + * for the last time. */ - if (bip->bli_flags & XFS_BLI_STALE) { - bip->bli_flags &= ~XFS_BLI_LOGGED; - xfs_buf_item_trace("UNLOCK STALE", bip); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - if (!aborted) + if (flags & XFS_BLI_STALE) { + trace_xfs_buf_item_unlock_stale(bip); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + if (!aborted) { + atomic_dec(&bip->bli_refcount); return; + } } - /* - * Drop the transaction's reference to the log item if - * it was not logged as part of the transaction. Otherwise - * we'll drop the reference in xfs_buf_item_unpin() when - * the transaction is really through with the buffer. - */ - if (!(bip->bli_flags & XFS_BLI_LOGGED)) { - atomic_dec(&bip->bli_refcount); - } else { - /* - * Clear the logged flag since this is per - * transaction state. - */ - bip->bli_flags &= ~XFS_BLI_LOGGED; - } + trace_xfs_buf_item_unlock(bip); /* - * Before possibly freeing the buf item, determine if we should - * release the buffer at the end of this routine. + * If the buf item isn't tracking any data, free it, otherwise drop the + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. + * + * Ordered buffers are dirty but may have no recorded changes, so ensure + * we only release clean items here. */ - hold = bip->bli_flags & XFS_BLI_HOLD; - xfs_buf_item_trace("UNLOCK", bip); + clean = (flags & XFS_BLI_DIRTY) ? false : true; + if (clean) { + int i; + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) { + clean = false; + break; + } + } + } /* - * If the buf item isn't tracking any data, free it. - * Otherwise, if XFS_BLI_HOLD is set clear it. + * Clean buffers, by definition, cannot be in the AIL. However, aborted + * buffers may be dirty and hence in the AIL. Therefore if we are + * aborting a buffer and we've just taken the last refernce away, we + * have to check if it is in the AIL before freeing it. We need to free + * it in this case, because an aborted transaction has already shut the + * filesystem down and this is the last chance we will have to do so. */ - if (xfs_count_bits(bip->bli_format.blf_data_map, - bip->bli_format.blf_map_size, 0) == 0) { - xfs_buf_item_relse(bp); - } else if (hold) { - bip->bli_flags &= ~XFS_BLI_HOLD; + if (atomic_dec_and_test(&bip->bli_refcount)) { + if (clean) + xfs_buf_item_relse(bp); + else if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&lip->li_ailp->xa_lock); + xfs_trans_ail_delete(lip->li_ailp, lip, + SHUTDOWN_LOG_IO_ERROR); + } + xfs_buf_item_relse(bp); + } } - /* - * Release the buffer if XFS_BLI_HOLD was not set. - */ - if (!hold) { + if (!(flags & XFS_BLI_HOLD)) xfs_buf_relse(bp); - } } /* @@ -616,91 +676,70 @@ xfs_buf_item_unlock( */ STATIC xfs_lsn_t xfs_buf_item_committed( - xfs_buf_log_item_t *bip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - xfs_buf_item_trace("COMMITTED", bip); - if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && - (bip->bli_item.li_lsn != 0)) { - return bip->bli_item.li_lsn; - } - return (lsn); + struct xfs_buf_log_item *bip = BUF_ITEM(lip); + + trace_xfs_buf_item_committed(bip); + + if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && lip->li_lsn != 0) + return lip->li_lsn; + return lsn; } -/* - * This is called when the transaction holding the buffer is aborted. - * Just behave as if the transaction had been cancelled. If we're shutting down - * and have aborted this transaction, we'll trap this buffer when it tries to - * get written out. - */ STATIC void -xfs_buf_item_abort( - xfs_buf_log_item_t *bip) +xfs_buf_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) { - xfs_buf_t *bp; - - bp = bip->bli_buf; - xfs_buftrace("XFS_ABORT", bp); - XFS_BUF_SUPER_STALE(bp); - xfs_buf_item_unlock(bip); - return; } /* - * This is called to asynchronously write the buffer associated with this - * buf log item out to disk. The buffer will already have been locked by - * a successful call to xfs_buf_item_trylock(). If the buffer still has - * B_DELWRI set, then get it going out to disk with a call to bawrite(). - * If not, then just release the buffer. + * This is the ops vector shared by all buf log items. */ -STATIC void -xfs_buf_item_push( - xfs_buf_log_item_t *bip) -{ - xfs_buf_t *bp; - - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - xfs_buf_item_trace("PUSH", bip); +static const struct xfs_item_ops xfs_buf_item_ops = { + .iop_size = xfs_buf_item_size, + .iop_format = xfs_buf_item_format, + .iop_pin = xfs_buf_item_pin, + .iop_unpin = xfs_buf_item_unpin, + .iop_unlock = xfs_buf_item_unlock, + .iop_committed = xfs_buf_item_committed, + .iop_push = xfs_buf_item_push, + .iop_committing = xfs_buf_item_committing +}; - bp = bip->bli_buf; +STATIC int +xfs_buf_item_get_format( + struct xfs_buf_log_item *bip, + int count) +{ + ASSERT(bip->bli_formats == NULL); + bip->bli_format_count = count; - if (XFS_BUF_ISDELAYWRITE(bp)) { - xfs_bawrite(bip->bli_item.li_mountp, bp); - } else { - xfs_buf_relse(bp); + if (count == 1) { + bip->bli_formats = &bip->__bli_format; + return 0; } + + bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), + KM_SLEEP); + if (!bip->bli_formats) + return ENOMEM; + return 0; } -/* ARGSUSED */ STATIC void -xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) +xfs_buf_item_free_format( + struct xfs_buf_log_item *bip) { + if (bip->bli_formats != &bip->__bli_format) { + kmem_free(bip->bli_formats); + bip->bli_formats = NULL; + } } /* - * This is the ops vector shared by all buf log items. - */ -STATIC struct xfs_item_ops xfs_buf_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_buf_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) - xfs_buf_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_buf_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_buf_item_abort, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_buf_item_committing -}; - - -/* * Allocate a new buf log item to go with the given buffer. * Set the buffer's b_fsprivate field to point to the new * buf log item. If there are other item's attached to the @@ -712,10 +751,12 @@ xfs_buf_item_init( xfs_buf_t *bp, xfs_mount_t *mp) { - xfs_log_item_t *lip; + xfs_log_item_t *lip = bp->b_fspriv; xfs_buf_log_item_t *bip; int chunks; int map_size; + int error; + int i; /* * Check to see if there is already a buf log item for @@ -723,62 +764,45 @@ xfs_buf_item_init( * the first. If we do already have one, there is * nothing to do here so return. */ - if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) - XFS_BUF_SET_FSPRIVATE3(bp, mp); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - if (lip->li_type == XFS_LI_BUF) { - return; - } - } + ASSERT(bp->b_target->bt_mount == mp); + if (lip != NULL && lip->li_type == XFS_LI_BUF) + return; - /* - * chunks is the number of XFS_BLI_CHUNK size pieces - * the buffer can be divided into. Make sure not to - * truncate any pieces. map_size is the size of the - * bitmap needed to describe the chunks of the buffer. - */ - chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); - map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); - - bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, - KM_SLEEP); - bip->bli_item.li_type = XFS_LI_BUF; - bip->bli_item.li_ops = &xfs_buf_item_ops; - bip->bli_item.li_mountp = mp; + bip = kmem_zone_zalloc(xfs_buf_item_zone, KM_SLEEP); + xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; - bip->bli_format.blf_type = XFS_LI_BUF; - bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); - bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); - bip->bli_format.blf_map_size = map_size; -#ifdef XFS_BLI_TRACE - bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP); -#endif - -#ifdef XFS_TRANS_DEBUG + xfs_buf_hold(bp); + /* - * Allocate the arrays for tracking what needs to be logged - * and what our callers request to be logged. bli_orig - * holds a copy of the original, clean buffer for comparison - * against, and bli_logged keeps a 1 bit flag per byte in - * the buffer to indicate which bytes the callers have asked - * to have logged. + * chunks is the number of XFS_BLF_CHUNK size pieces the buffer + * can be divided into. Make sure not to truncate any pieces. + * map_size is the size of the bitmap needed to describe the + * chunks of the buffer. + * + * Discontiguous buffer support follows the layout of the underlying + * buffer. This makes the implementation as simple as possible. */ - bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); - memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); - bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); -#endif + error = xfs_buf_item_get_format(bip, bp->b_map_count); + ASSERT(error == 0); + + for (i = 0; i < bip->bli_format_count; i++) { + chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len), + XFS_BLF_CHUNK); + map_size = DIV_ROUND_UP(chunks, NBWORD); + + bip->bli_formats[i].blf_type = XFS_LI_BUF; + bip->bli_formats[i].blf_blkno = bp->b_maps[i].bm_bn; + bip->bli_formats[i].blf_len = bp->b_maps[i].bm_len; + bip->bli_formats[i].blf_map_size = map_size; + } /* * Put the buf item into the list of items attached to the * buffer at the front. */ - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - bip->bli_item.li_bio_list = - XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - } - XFS_BUF_SET_FSPRIVATE(bp, bip); + if (bp->b_fspriv) + bip->bli_item.li_bio_list = bp->b_fspriv; + bp->b_fspriv = bip; } @@ -786,11 +810,11 @@ xfs_buf_item_init( * Mark bytes first through last inclusive as dirty in the buf * item's bitmap. */ -void -xfs_buf_item_log( - xfs_buf_log_item_t *bip, +static void +xfs_buf_item_log_segment( uint first, - uint last) + uint last, + uint *map) { uint first_bit; uint last_bit; @@ -803,16 +827,10 @@ xfs_buf_item_log( uint mask; /* - * Mark the item as having some dirty data for - * quick reference in xfs_buf_item_dirty. - */ - bip->bli_flags |= XFS_BLI_DIRTY; - - /* * Convert byte offsets to bit numbers. */ - first_bit = first >> XFS_BLI_SHIFT; - last_bit = last >> XFS_BLI_SHIFT; + first_bit = first >> XFS_BLF_SHIFT; + last_bit = last >> XFS_BLF_SHIFT; /* * Calculate the total number of bits to be set. @@ -824,7 +842,7 @@ xfs_buf_item_log( * to set a bit in. */ word_num = first_bit >> BIT_TO_WORD_SHIFT; - wordp = &(bip->bli_format.blf_data_map[word_num]); + wordp = &map[word_num]; /* * Calculate the starting bit in the first word. @@ -867,13 +885,50 @@ xfs_buf_item_log( mask = (1 << end_bit) - 1; *wordp |= mask; } +} - xfs_buf_item_log_debug(bip, first, last); +/* + * Mark bytes first through last inclusive as dirty in the buf + * item's bitmap. + */ +void +xfs_buf_item_log( + xfs_buf_log_item_t *bip, + uint first, + uint last) +{ + int i; + uint start; + uint end; + struct xfs_buf *bp = bip->bli_buf; + + /* + * walk each buffer segment and mark them dirty appropriately. + */ + start = 0; + for (i = 0; i < bip->bli_format_count; i++) { + if (start > last) + break; + end = start + BBTOB(bp->b_maps[i].bm_len); + if (first > end) { + start += BBTOB(bp->b_maps[i].bm_len); + continue; + } + if (first < start) + first = start; + if (end > last) + end = last; + + xfs_buf_item_log_segment(first, end, + &bip->bli_formats[i].blf_data_map[0]); + + start += bp->b_maps[i].bm_len; + } } /* - * Return 1 if the buffer has some data that has been logged (at any + * Return 1 if the buffer has been logged or ordered in a transaction (at any * point, not just the current transaction) and 0 if not. */ uint @@ -883,6 +938,14 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ + xfs_buf_item_free_format(bip); + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -894,28 +957,17 @@ void xfs_buf_item_relse( xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; - xfs_buftrace("XFS_RELSE", bp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); - if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && - (XFS_BUF_IODONE_FUNC(bp) != NULL)) { - ASSERT((XFS_BUF_ISUNINITIAL(bp)) == 0); - XFS_BUF_CLR_IODONE_FUNC(bp); - } + trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ + bp->b_fspriv = bip->bli_item.li_bio_list; + if (bp->b_fspriv == NULL) + bp->b_iodone = NULL; -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -936,32 +988,42 @@ xfs_buf_attach_iodone( { xfs_log_item_t *head_lip; - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); lip->li_cb = cb; - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + head_lip = bp->b_fspriv; + if (head_lip) { lip->li_bio_list = head_lip->li_bio_list; head_lip->li_bio_list = lip; } else { - XFS_BUF_SET_FSPRIVATE(bp, lip); + bp->b_fspriv = lip; } - ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || - (XFS_BUF_IODONE_FUNC(bp) == NULL)); - XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); + bp->b_iodone = xfs_buf_iodone_callbacks; } +/* + * We can have many callbacks on a buffer. Running the callbacks individually + * can cause a lot of contention on the AIL lock, so we allow for a single + * callback to be able to scan the remaining lip->li_bio_list for other items + * of the same type and callback to be processed in the first call. + * + * As a result, the loop walking the callback list below will also modify the + * list. it removes the first item from the list and then runs the callback. + * The loop then restarts from the new head of the list. This allows the + * callback to scan and modify the list attached to the buffer and we don't + * have to care about maintaining a next item pointer. + */ STATIC void xfs_buf_do_callbacks( - xfs_buf_t *bp, - xfs_log_item_t *lip) + struct xfs_buf *bp) { - xfs_log_item_t *nlip; + struct xfs_log_item *lip; - while (lip != NULL) { - nlip = lip->li_bio_list; + while ((lip = bp->b_fspriv) != NULL) { + bp->b_fspriv = lip->li_bio_list; ASSERT(lip->li_cb != NULL); /* * Clear the next pointer so we don't have any @@ -971,7 +1033,6 @@ xfs_buf_do_callbacks( */ lip->li_bio_list = NULL; lip->li_cb(bp, lip); - lip = nlip; } } @@ -984,141 +1045,78 @@ xfs_buf_do_callbacks( */ void xfs_buf_iodone_callbacks( - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_log_item_t *lip; - static ulong lasttime; - static xfs_buftarg_t *lasttarg; - xfs_mount_t *mp; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_mount *mp = lip->li_mountp; + static ulong lasttime; + static xfs_buftarg_t *lasttarg; - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + if (likely(!bp->b_error)) + goto do_callbacks; - if (XFS_BUF_GETERROR(bp) != 0) { - /* - * If we've already decided to shutdown the filesystem - * because of IO errors, there's no point in giving this - * a retry. - */ - mp = lip->li_mountp; - if (XFS_FORCED_SHUTDOWN(mp)) { - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - XFS_BUF_SUPER_STALE(bp); - xfs_buftrace("BUF_IODONE_CB", bp); - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); + /* + * If we've already decided to shutdown the filesystem because of + * I/O errors, there's no point in giving this a retry. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + trace_xfs_buf_item_iodone(bp, _RET_IP_); + goto do_callbacks; + } - /* - * XFS_SHUT flag gets set when we go thru the - * entire buffer cache and deliberately start - * throwing away delayed write buffers. - * Since there's no biowait done on those, - * we should just brelse them. - */ - if (XFS_BUF_ISSHUT(bp)) { - XFS_BUF_UNSHUT(bp); - xfs_buf_relse(bp); - } else { - xfs_biodone(bp); - } + if (bp->b_target != lasttarg || + time_after(jiffies, (lasttime + 5*HZ))) { + lasttime = jiffies; + xfs_buf_ioerror_alert(bp, __func__); + } + lasttarg = bp->b_target; - return; - } + /* + * If the write was asynchronous then no one will be looking for the + * error. Clear the error state and write the buffer out again. + * + * XXX: This helps against transient write errors, but we need to find + * a way to shut the filesystem down if the writes keep failing. + * + * In practice we'll shut the filesystem down soon as non-transient + * erorrs tend to affect the whole device and a failing log write + * will make us give up. But we really ought to do better here. + */ + if (XFS_BUF_ISASYNC(bp)) { + ASSERT(bp->b_iodone != NULL); - if ((XFS_BUF_TARGET(bp) != lasttarg) || - (time_after(jiffies, (lasttime + 5*HZ)))) { - lasttime = jiffies; - cmn_err(CE_ALERT, "Device %s, XFS metadata write error" - " block 0x%llx in %s", - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); - } - lasttarg = XFS_BUF_TARGET(bp); + trace_xfs_buf_item_iodone_async(bp, _RET_IP_); - if (XFS_BUF_ISASYNC(bp)) { - /* - * If the write was asynchronous then noone will be - * looking for the error. Clear the error state - * and write the buffer out again delayed write. - * - * XXXsup This is OK, so long as we catch these - * before we start the umount; we don't want these - * DELWRI metadata bufs to be hanging around. - */ - XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ + xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ - if (!(XFS_BUF_ISSTALE(bp))) { - XFS_BUF_DELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_SET_START(bp); - } - ASSERT(XFS_BUF_IODONE_FUNC(bp)); - xfs_buftrace("BUF_IODONE ASYNC", bp); - xfs_buf_relse(bp); + if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { + bp->b_flags |= XBF_WRITE | XBF_ASYNC | + XBF_DONE | XBF_WRITE_FAIL; + xfs_buf_iorequest(bp); } else { - /* - * If the write of the buffer was not asynchronous, - * then we want to make sure to return the error - * to the caller of bwrite(). Because of this we - * cannot clear the B_ERROR state at this point. - * Instead we install a callback function that - * will be called when the buffer is released, and - * that routine will clear the error state and - * set the buffer to be written out again after - * some delay. - */ - /* We actually overwrite the existing b-relse - function at times, but we're gonna be shutting down - anyway. */ - XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); - XFS_BUF_DONE(bp); - XFS_BUF_V_IODONESEMA(bp); + xfs_buf_relse(bp); } + return; } -#ifdef XFSERRORDEBUG - xfs_buftrace("XFS BUFCB NOERR", bp); -#endif - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_biodone(bp); -} -/* - * This is a callback routine attached to a buffer which gets an error - * when being written out synchronously. - */ -STATIC void -xfs_buf_error_relse( - xfs_buf_t *bp) -{ - xfs_log_item_t *lip; - xfs_mount_t *mp; - - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - mp = (xfs_mount_t *)lip->li_mountp; - ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); - - XFS_BUF_STALE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_ERROR(bp,0); - xfs_buftrace("BUF_ERROR_RELSE", bp); - if (! XFS_FORCED_SHUTDOWN(mp)) - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); /* - * We have to unpin the pinned buffers so do the - * callbacks. + * If the write of the buffer was synchronous, we want to make + * sure to return the error to the caller of xfs_bwrite(). */ - xfs_buf_do_callbacks(bp, lip); - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_SET_BRELSE_FUNC(bp,NULL); - xfs_buf_relse(bp); -} + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + + trace_xfs_buf_error_relse(bp, _RET_IP_); +do_callbacks: + xfs_buf_do_callbacks(bp); + bp->b_fspriv = NULL; + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); +} /* * This is the iodone() function for buffers which have been @@ -1127,74 +1125,27 @@ xfs_buf_error_relse( * It is called by xfs_buf_iodone_callbacks() above which will take * care of cleaning up the buffer itself. */ -/* ARGSUSED */ void xfs_buf_iodone( - xfs_buf_t *bp, - xfs_buf_log_item_t *bip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - struct xfs_mount *mp; - SPLDECL(s); + struct xfs_ail *ailp = lip->li_ailp; - ASSERT(bip->bli_buf == bp); + ASSERT(BUF_ITEM(lip)->bli_buf == bp); - mp = bip->bli_item.li_mountp; + xfs_buf_rele(bp); /* * If we are forcibly shutting down, this may well be * off the AIL already. That's because we simulate the * log-committed callbacks to unpin these buffers. Or we may never * have put this item on AIL because of the transaction was - * aborted forcibly. xfs_trans_delete_ail() takes care of these. + * aborted forcibly. xfs_trans_ail_delete() takes care of these. * * Either way, AIL is useless if we're forcing a shutdown. */ - AIL_LOCK(mp,s); - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp)); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); -} - -#if defined(XFS_BLI_TRACE) -void -xfs_buf_item_trace( - char *id, - xfs_buf_log_item_t *bip) -{ - xfs_buf_t *bp; - ASSERT(bip->bli_trace != NULL); - - bp = bip->bli_buf; - ktrace_enter(bip->bli_trace, - (void *)id, - (void *)bip->bli_buf, - (void *)((unsigned long)bip->bli_flags), - (void *)((unsigned long)bip->bli_recur), - (void *)((unsigned long)atomic_read(&bip->bli_refcount)), - (void *)((unsigned long) - (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)), - (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))), - (void *)((unsigned long)XFS_BUF_COUNT(bp)), - (void *)((unsigned long)XFS_BUF_BFLAGS(bp)), - XFS_BUF_FSPRIVATE(bp, void *), - XFS_BUF_FSPRIVATE2(bp, void *), - (void *)(unsigned long)XFS_BUF_ISPINNED(bp), - (void *)XFS_BUF_IODONE_FUNC(bp), - (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))), - (void *)bip->bli_item.li_desc, - (void *)((unsigned long)bip->bli_item.li_flags)); + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + xfs_buf_item_free(BUF_ITEM(lip)); } -#endif /* XFS_BLI_TRACE */ diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 07c708c2b52..3f3455a4151 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -18,88 +18,33 @@ #ifndef __XFS_BUF_ITEM_H__ #define __XFS_BUF_ITEM_H__ -/* - * This is the structure used to lay out a buf log item in the - * log. The data map describes which 128 byte chunks of the buffer - * have been logged. This structure works only on buffers that - * reside up to the first TB in the filesystem. These buffers are - * generated only by pre-6.2 systems and are known as XFS_LI_6_1_BUF. - */ -typedef struct xfs_buf_log_format_v1 { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - __int32_t blf_blkno; /* starting blkno of this buf */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_v1_t; - -/* - * This is a form of the above structure with a 64 bit blkno field. - * For 6.2 and beyond, this is XFS_LI_BUF. We use this to log everything. - */ -typedef struct xfs_buf_log_format_t { - unsigned short blf_type; /* buf log item type indicator */ - unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ - __int64_t blf_blkno; /* starting blkno of this buf */ - unsigned int blf_map_size; /* size of data bitmap in words */ - unsigned int blf_data_map[1];/* variable size bitmap of */ - /* regions of buffer in this item */ -} xfs_buf_log_format_t; +/* kernel only definitions */ -/* - * This flag indicates that the buffer contains on disk inodes - * and requires special recovery handling. - */ -#define XFS_BLI_INODE_BUF 0x1 -/* - * This flag indicates that the buffer should not be replayed - * during recovery because its blocks are being freed. - */ -#define XFS_BLI_CANCEL 0x2 -/* - * This flag indicates that the buffer contains on disk - * user or group dquots and may require special recovery handling. - */ -#define XFS_BLI_UDQUOT_BUF 0x4 -#define XFS_BLI_PDQUOT_BUF 0x8 -#define XFS_BLI_GDQUOT_BUF 0x10 - -#define XFS_BLI_CHUNK 128 -#define XFS_BLI_SHIFT 7 -#define BIT_TO_WORD_SHIFT 5 -#define NBWORD (NBBY * sizeof(unsigned int)) - -/* - * buf log item flags - */ +/* buf log item flags */ #define XFS_BLI_HOLD 0x01 #define XFS_BLI_DIRTY 0x02 #define XFS_BLI_STALE 0x04 #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 #define XFS_BLI_STALE_INODE 0x20 +#define XFS_BLI_INODE_BUF 0x40 +#define XFS_BLI_ORDERED 0x80 +#define XFS_BLI_FLAGS \ + { XFS_BLI_HOLD, "HOLD" }, \ + { XFS_BLI_DIRTY, "DIRTY" }, \ + { XFS_BLI_STALE, "STALE" }, \ + { XFS_BLI_LOGGED, "LOGGED" }, \ + { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ + { XFS_BLI_STALE_INODE, "STALE_INODE" }, \ + { XFS_BLI_INODE_BUF, "INODE_BUF" }, \ + { XFS_BLI_ORDERED, "ORDERED" } -#ifdef __KERNEL__ struct xfs_buf; -struct ktrace; struct xfs_mount; struct xfs_buf_log_item; -#if defined(XFS_BLI_TRACE) -#define XFS_BLI_TRACE_SIZE 32 - -void xfs_buf_item_trace(char *, struct xfs_buf_log_item *); -#else -#define xfs_buf_item_trace(id, bip) -#endif - /* * This is the in core log item structure used to track information * needed to log buffers. It tracks how many times the lock has been @@ -111,27 +56,11 @@ typedef struct xfs_buf_log_item { unsigned int bli_flags; /* misc flags */ unsigned int bli_recur; /* lock recursion count */ atomic_t bli_refcount; /* cnt of tp refs */ -#ifdef XFS_BLI_TRACE - struct ktrace *bli_trace; /* event trace buf */ -#endif -#ifdef XFS_TRANS_DEBUG - char *bli_orig; /* original buffer copy */ - char *bli_logged; /* bytes logged (bitmap) */ -#endif - xfs_buf_log_format_t bli_format; /* in-log header */ + int bli_format_count; /* count of headers */ + struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ + struct xfs_buf_log_format __bli_format; /* embedded in-log header */ } xfs_buf_log_item_t; -/* - * This structure is used during recovery to record the buf log - * items which have been canceled and should not be replayed. - */ -typedef struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; - uint bc_len; - int bc_refcount; - struct xfs_buf_cancel *bc_next; -} xfs_buf_cancel_t; - void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); @@ -140,18 +69,8 @@ void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); -void xfs_buf_iodone(struct xfs_buf *, xfs_buf_log_item_t *); - -#ifdef XFS_TRANS_DEBUG -void -xfs_buf_item_flush_log_debug( - struct xfs_buf *bp, - uint first, - uint last); -#else -#define xfs_buf_item_flush_log_debug(bp, first, last) -#endif +void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); -#endif /* __KERNEL__ */ +extern kmem_zone_t *xfs_buf_item_zone; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h deleted file mode 100644 index 7a0e482dd43..00000000000 --- a/fs/xfs/xfs_cap.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CAP_H__ -#define __XFS_CAP_H__ - -/* - * Capabilities - */ -typedef __uint64_t xfs_cap_value_t; - -typedef struct xfs_cap_set { - xfs_cap_value_t cap_effective; /* use in capability checks */ - xfs_cap_value_t cap_permitted; /* combined with file attrs */ - xfs_cap_value_t cap_inheritable;/* pass through exec */ -} xfs_cap_set_t; - -/* On-disk XFS extended attribute names */ -#define SGI_CAP_FILE "SGI_CAP_FILE" -#define SGI_CAP_FILE_SIZE (sizeof(SGI_CAP_FILE)-1) -#define SGI_CAP_LINUX "SGI_CAP_LINUX" -#define SGI_CAP_LINUX_SIZE (sizeof(SGI_CAP_LINUX)-1) - -/* - * For Linux, we take the bitfields directly from capability.h - * and no longer attempt to keep this attribute ondisk compatible - * with IRIX. Since this attribute is only set on executables, - * it just doesn't make much sense to try. We do use a different - * named attribute though, to avoid confusion. - */ - -#ifdef __KERNEL__ - -#ifdef CONFIG_FS_POSIX_CAP - -#include <linux/posix_cap_xattr.h> - -struct bhv_vnode; - -extern int xfs_cap_vhascap(struct bhv_vnode *); -extern int xfs_cap_vset(struct bhv_vnode *, void *, size_t); -extern int xfs_cap_vget(struct bhv_vnode *, void *, size_t); -extern int xfs_cap_vremove(struct bhv_vnode *); - -#define _CAP_EXISTS xfs_cap_vhascap - -#else -#define xfs_cap_vset(v,p,sz) (-EOPNOTSUPP) -#define xfs_cap_vget(v,p,sz) (-EOPNOTSUPP) -#define xfs_cap_vremove(v) (-EOPNOTSUPP) -#define _CAP_EXISTS (NULL) -#endif - -#endif /* __KERNEL__ */ - -#endif /* __XFS_CAP_H__ */ diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 00000000000..fad1676ad8c --- /dev/null +++ b/fs/xfs/xfs_cksum.h @@ -0,0 +1,63 @@ +#ifndef _XFS_CKSUM_H +#define _XFS_CKSUM_H 1 + +#define XFS_CRC_SEED (~(__uint32_t)0) + +/* + * Calculate the intermediate checksum for a buffer that has the CRC field + * inside it. The offset of the 32bit crc fields is passed as the + * cksum_offset parameter. + */ +static inline __uint32_t +xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t zero = 0; + __uint32_t crc; + + /* Calculate CRC up to the checksum. */ + crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); + + /* Skip checksum field */ + crc = crc32c(crc, &zero, sizeof(__u32)); + + /* Calculate the rest of the CRC. */ + return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], + length - (cksum_offset + sizeof(__be32))); +} + +/* + * Convert the intermediate checksum to the final ondisk format. + * + * The CRC32c calculation uses LE format even on BE machines, but returns the + * result in host endian format. Hence we need to byte swap it back to LE format + * so that it is consistent on disk. + */ +static inline __le32 +xfs_end_cksum(__uint32_t crc) +{ + return ~cpu_to_le32(crc); +} + +/* + * Helper to generate the checksum for a buffer. + */ +static inline void +xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); +} + +/* + * Helper to verify the checksum for a buffer. + */ +static inline int +xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) +{ + __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + + return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); +} + +#endif /* _XFS_CKSUM_H */ diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h deleted file mode 100644 index 5b7eb81453b..00000000000 --- a/fs/xfs/xfs_clnt.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CLNT_H__ -#define __XFS_CLNT_H__ - -/* - * XFS arguments structure, constructed from the arguments we - * are passed via the mount system call. - * - * NOTE: The mount system call is handled differently between - * Linux and IRIX. In IRIX we worked work with a binary data - * structure coming in across the syscall interface from user - * space (the mount userspace knows about each filesystem type - * and the set of valid options for it, and converts the users - * argument string into a binary structure _before_ making the - * system call), and the ABI issues that this implies. - * - * In Linux, we are passed a comma separated set of options; - * ie. a NULL terminated string of characters. Userspace mount - * code does not have any knowledge of mount options expected by - * each filesystem type and so each filesystem parses its mount - * options in kernel space. - * - * For the Linux port, we kept this structure pretty much intact - * and use it internally (because the existing code groks it). - */ -struct xfs_mount_args { - int flags; /* flags -> see XFSMNT_... macros below */ - int flags2; /* flags -> see XFSMNT2_... macros below */ - int logbufs; /* Number of log buffers, -1 to default */ - int logbufsize; /* Size of log buffers, -1 to default */ - char fsname[MAXNAMELEN+1]; /* data device name */ - char rtname[MAXNAMELEN+1]; /* realtime device filename */ - char logname[MAXNAMELEN+1]; /* journal device filename */ - char mtpt[MAXNAMELEN+1]; /* filesystem mount point */ - int sunit; /* stripe unit (BBs) */ - int swidth; /* stripe width (BBs), multiple of sunit */ - uchar_t iosizelog; /* log2 of the preferred I/O size */ - int ihashsize; /* inode hash table size (buckets) */ -}; - -/* - * XFS mount option flags -- args->flags1 - */ -#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */ -#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount - * compatible */ -#define XFSMNT_INO64 0x00000004 /* move inode numbers up - * past 2^32 */ -#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */ -#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */ -#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit - * enforcement */ -#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit - * enforcement */ -#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */ -#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at - * stripe boundaries*/ -#define XFSMNT_RETERR 0x00000400 /* return error to user */ -#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies - * read-only mount */ -#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */ -#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */ -#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */ - /* (osyncisdsync is default) */ -#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32 - * bits of address space */ -#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */ -#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit - * enforcement */ -#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */ -#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */ -#define XFSMNT_BARRIER 0x04000000 /* use write barriers */ -#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */ -#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width - * allocation */ -#define XFSMNT_IHASHSIZE 0x20000000 /* inode hash table size */ -#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename - * symlink,mkdir,rmdir,mknod */ -#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */ - -/* - * XFS mount option flags -- args->flags2 - */ -#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred - * I/O size in stat(2) */ - -#endif /* __XFS_CLNT_H__ */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 32ab61d17ac..a514ab61665 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,35 +18,29 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_alloc.h" -#include "xfs_btree.h" #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_attr_leaf.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_buf_item.h" /* * xfs_da_btree.c @@ -60,44 +55,237 @@ /* * Routines used for growing the Btree. */ -STATIC int xfs_da_root_split(xfs_da_state_t *state, +STATIC int xfs_da3_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_root, xfs_da_state_blk_t *new_child); -STATIC int xfs_da_node_split(xfs_da_state_t *state, +STATIC int xfs_da3_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *existing_blk, xfs_da_state_blk_t *split_blk, xfs_da_state_blk_t *blk_to_add, int treelevel, int *result); -STATIC void xfs_da_node_rebalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *node_blk_1, xfs_da_state_blk_t *node_blk_2); -STATIC void xfs_da_node_add(xfs_da_state_t *state, +STATIC void xfs_da3_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *old_node_blk, xfs_da_state_blk_t *new_node_blk); /* * Routines used for shrinking the Btree. */ -STATIC int xfs_da_root_join(xfs_da_state_t *state, +STATIC int xfs_da3_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk); -STATIC int xfs_da_node_toosmall(xfs_da_state_t *state, int *retval); -STATIC void xfs_da_node_remove(xfs_da_state_t *state, +STATIC int xfs_da3_node_toosmall(xfs_da_state_t *state, int *retval); +STATIC void xfs_da3_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk); -STATIC void xfs_da_node_unbalance(xfs_da_state_t *state, +STATIC void xfs_da3_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *src_node_blk, xfs_da_state_blk_t *dst_node_blk); /* * Utility routines. */ -STATIC uint xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count); -STATIC int xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp); -STATIC xfs_dabuf_t *xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra); -STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, +STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, xfs_da_state_blk_t *save_blk); -STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); + + +kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ + +/* + * Allocate a dir-state structure. + * We don't put them on the stack since they're large. + */ +xfs_da_state_t * +xfs_da_state_alloc(void) +{ + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); +} + +/* + * Kill the altpath contents of a da-state structure. + */ +STATIC void +xfs_da_state_kill_altpath(xfs_da_state_t *state) +{ + int i; + + for (i = 0; i < state->altpath.active; i++) + state->altpath.blk[i].bp = NULL; + state->altpath.active = 0; +} + +/* + * Free a da-state structure. + */ +void +xfs_da_state_free(xfs_da_state_t *state) +{ + xfs_da_state_kill_altpath(state); +#ifdef DEBUG + memset((char *)state, 0, sizeof(*state)); +#endif /* DEBUG */ + kmem_zone_free(xfs_da_state_zone, state); +} + +static bool +xfs_da3_node_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_da_intnode *hdr = bp->b_addr; + struct xfs_da3_icnode_hdr ichdr; + const struct xfs_dir_ops *ops; + + ops = xfs_dir_get_ops(mp, NULL); + + ops->node_hdr_from_disk(&ichdr, hdr); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (ichdr.magic != XFS_DA3_NODE_MAGIC) + return false; + + if (!uuid_equal(&hdr3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) + return false; + } else { + if (ichdr.magic != XFS_DA_NODE_MAGIC) + return false; + } + if (ichdr.level == 0) + return false; + if (ichdr.level > XFS_DA_NODE_MAXDEPTH) + return false; + if (ichdr.count == 0) + return false; + + /* + * we don't know if the node is for and attribute or directory tree, + * so only fail if the count is outside both bounds + */ + if (ichdr.count > mp->m_dir_geo->node_ents && + ichdr.count > mp->m_attr_geo->node_ents) + return false; + + /* XXX: hash order check? */ + + return true; +} + +static void +xfs_da3_node_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF); +} + +/* + * leaf/node format detection on trees is sketchy, so a node read can be done on + * leaf level blocks when detection identifies the tree as a node format tree + * incorrectly. In this case, we need to swap the verifier to match the correct + * format of the block being read. + */ +static void +xfs_da3_node_read_verify( + struct xfs_buf *bp) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA3_NODE_MAGIC: + if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { + xfs_buf_ioerror(bp, EFSBADCRC); + break; + } + /* fall through */ + case XFS_DA_NODE_MAGIC: + if (!xfs_da3_node_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + break; + } + return; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + bp->b_ops = &xfs_attr3_leaf_buf_ops; + bp->b_ops->verify_read(bp); + return; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + bp->b_ops = &xfs_dir3_leafn_buf_ops; + bp->b_ops->verify_read(bp); + return; + default: + break; + } + + /* corrupt block */ + xfs_verifier_error(bp); +} + +const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .verify_read = xfs_da3_node_read_verify, + .verify_write = xfs_da3_node_write_verify, +}; + +int +xfs_da3_node_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int which_fork) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, + which_fork, &xfs_da3_node_buf_ops); + if (!err && tp) { + struct xfs_da_blkinfo *info = (*bpp)->b_addr; + int type; + + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + type = XFS_BLFT_DA_NODE_BUF; + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + type = XFS_BLFT_ATTR_LEAF_BUF; + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + type = XFS_BLFT_DIR_LEAFN_BUF; + break; + default: + type = 0; + ASSERT(0); + break; + } + xfs_trans_buf_set_type(tp, *bpp, type); + } + return err; +} /*======================================================================== * Routines used for growing the Btree. @@ -107,29 +295,46 @@ STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); * Create the initial contents of an intermediate node. */ int -xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork) +xfs_da3_node_create( + struct xfs_da_args *args, + xfs_dablk_t blkno, + int level, + struct xfs_buf **bpp, + int whichfork) { - xfs_da_intnode_t *node; - xfs_dabuf_t *bp; - int error; - xfs_trans_t *tp; - - tp = args->trans; - error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork); + struct xfs_da_intnode *node; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_da3_icnode_hdr ichdr = {0}; + struct xfs_buf *bp; + int error; + struct xfs_inode *dp = args->dp; + + trace_xfs_da_node_create(args); + ASSERT(level <= XFS_DA_NODE_MAXDEPTH); + + error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); if (error) return(error); - ASSERT(bp != NULL); - node = bp->data; - node->hdr.info.forw = 0; - node->hdr.info.back = 0; - node->hdr.info.magic = cpu_to_be16(XFS_DA_NODE_MAGIC); - node->hdr.info.pad = 0; - node->hdr.count = 0; - node->hdr.level = cpu_to_be16(level); - - xfs_da_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + bp->b_ops = &xfs_da3_node_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); + node = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + + ichdr.magic = XFS_DA3_NODE_MAGIC; + hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.owner = cpu_to_be64(args->dp->i_ino); + uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_uuid); + } else { + ichdr.magic = XFS_DA_NODE_MAGIC; + } + ichdr.level = level; + + dp->d_ops->node_hdr_to_disk(node, &ichdr); + xfs_trans_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); *bpp = bp; return(0); @@ -140,12 +345,20 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, * intermediate nodes, rebalance, etc. */ int /* error */ -xfs_da_split(xfs_da_state_t *state) +xfs_da3_split( + struct xfs_da_state *state) { - xfs_da_state_blk_t *oldblk, *newblk, *addblk; - xfs_da_intnode_t *node; - xfs_dabuf_t *bp; - int max, action, error, i; + struct xfs_da_state_blk *oldblk; + struct xfs_da_state_blk *newblk; + struct xfs_da_state_blk *addblk; + struct xfs_da_intnode *node; + struct xfs_buf *bp; + int max; + int action = 0; + int error; + int i; + + trace_xfs_da_split(state->args); /* * Walk back up the tree splitting/inserting/adjusting as necessary. @@ -171,7 +384,7 @@ xfs_da_split(xfs_da_state_t *state) */ switch (oldblk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_split(state, oldblk, newblk); + error = xfs_attr3_leaf_split(state, oldblk, newblk); if ((error != 0) && (error != ENOSPC)) { return(error); /* GROT: attr is inconsistent */ } @@ -185,11 +398,13 @@ xfs_da_split(xfs_da_state_t *state) state->extravalid = 1; if (state->inleaf) { state->extraafter = 0; /* before newblk */ - error = xfs_attr_leaf_split(state, oldblk, + trace_xfs_attr_leaf_split_before(state->args); + error = xfs_attr3_leaf_split(state, oldblk, &state->extrablk); } else { state->extraafter = 1; /* after newblk */ - error = xfs_attr_leaf_split(state, newblk, + trace_xfs_attr_leaf_split_after(state->args); + error = xfs_attr3_leaf_split(state, newblk, &state->extrablk); } if (error) @@ -203,9 +418,8 @@ xfs_da_split(xfs_da_state_t *state) addblk = newblk; break; case XFS_DA_NODE_MAGIC: - error = xfs_da_node_split(state, oldblk, newblk, addblk, + error = xfs_da3_node_split(state, oldblk, newblk, addblk, max - i, &action); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; if (error) return(error); /* GROT: dir is inconsistent */ @@ -222,14 +436,7 @@ xfs_da_split(xfs_da_state_t *state) /* * Update the btree to show the new hashval for this child. */ - xfs_da_fixhashpath(state, &state->path); - /* - * If we won't need this block again, it's getting dropped - * from the active path by the loop control, so we need - * to mark it done now. - */ - if (i > 0 || !addblk) - xfs_da_buf_done(oldblk->bp); + xfs_da3_fixhashpath(state, &state->path); } if (!addblk) return(0); @@ -239,10 +446,8 @@ xfs_da_split(xfs_da_state_t *state) */ ASSERT(state->path.active == 0); oldblk = &state->path.blk[0]; - error = xfs_da_root_split(state, oldblk, addblk); + error = xfs_da3_root_split(state, oldblk, addblk); if (error) { - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(error); /* GROT: dir is inconsistent */ } @@ -252,9 +457,13 @@ xfs_da_split(xfs_da_state_t *state) * just got bumped because of the addition of a new root node. * There might be three blocks involved if a double split occurred, * and the original block 0 could be at any position in the list. + * + * Note: the magic numbers and sibling pointers are in the same + * physical place for both v2 and v3 headers (by design). Hence it + * doesn't matter which version of the xfs_da_intnode structure we use + * here as the result will be the same using either structure. */ - - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.forw) { if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { bp = addblk->bp; @@ -262,13 +471,13 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.back = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - node = oldblk->bp->data; + node = oldblk->bp->b_addr; if (node->hdr.info.back) { if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { bp = addblk->bp; @@ -276,14 +485,12 @@ xfs_da_split(xfs_da_state_t *state) ASSERT(state->extravalid); bp = state->extrablk.bp; } - node = bp->data; + node = bp->b_addr; node->hdr.info.forw = cpu_to_be32(oldblk->blkno); - xfs_da_log_buf(state->args->trans, bp, + xfs_trans_log_buf(state->args->trans, bp, XFS_DA_LOGRANGE(node, &node->hdr.info, sizeof(node->hdr.info))); } - xfs_da_buf_done(oldblk->bp); - xfs_da_buf_done(addblk->bp); addblk->bp = NULL; return(0); } @@ -294,101 +501,159 @@ xfs_da_split(xfs_da_state_t *state) * the EOF, extending the inode in process. */ STATIC int /* error */ -xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_root_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node, *oldroot; - xfs_da_args_t *args; - xfs_dablk_t blkno; - xfs_dabuf_t *bp; - int error, size; - xfs_inode_t *dp; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dir2_leaf_t *leaf; + struct xfs_da_intnode *node; + struct xfs_da_intnode *oldroot; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + struct xfs_buf *bp; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_dir2_leaf *leaf; + xfs_dablk_t blkno; + int level; + int error; + int size; + + trace_xfs_da_root_split(state->args); /* * Copy the existing (incorrect) block from the root node position * to a free space somewhere. */ args = state->args; - ASSERT(args != NULL); error = xfs_da_grow_inode(args, &blkno); if (error) - return(error); + return error; + dp = args->dp; tp = args->trans; mp = state->mp; error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); - node = bp->data; - oldroot = blk1->bp->data; - if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC) { - size = (int)((char *)&oldroot->btree[be16_to_cpu(oldroot->hdr.count)] - - (char *)oldroot); + return error; + node = bp->b_addr; + oldroot = blk1->bp->b_addr; + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { + struct xfs_da3_icnode_hdr nodehdr; + + dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + btree = dp->d_ops->node_tree_p(oldroot); + size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); + level = nodehdr.level; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); } else { - ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + leaf = (xfs_dir2_leaf_t *)oldroot; - size = (int)((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] - - (char *)leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + size = (int)((char *)&ents[leafhdr.count] - (char *)leaf); + level = 0; + + /* + * we are about to copy oldroot to bp, so set up the type + * of bp while we know exactly what it will be. + */ + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); } + + /* + * we can copy most of the information in the node from one block to + * another, but for CRC enabled headers we have to make sure that the + * block specific identifiers are kept intact. We update the buffer + * directly for this. + */ memcpy(node, oldroot, size); - xfs_da_log_buf(tp, bp, 0, size - 1); - xfs_da_buf_done(blk1->bp); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; + + node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + } + xfs_trans_log_buf(tp, bp, 0, size - 1); + + bp->b_ops = blk1->bp->b_ops; + xfs_trans_buf_copy_type(bp, blk1->bp); blk1->bp = bp; blk1->blkno = blkno; /* * Set up the new root node. */ - error = xfs_da_node_create(args, - (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0, - be16_to_cpu(node->hdr.level) + 1, &bp, args->whichfork); + error = xfs_da3_node_create(args, + (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0, + level + 1, &bp, args->whichfork); if (error) - return(error); - node = bp->data; - node->btree[0].hashval = cpu_to_be32(blk1->hashval); - node->btree[0].before = cpu_to_be32(blk1->blkno); - node->btree[1].hashval = cpu_to_be32(blk2->hashval); - node->btree[1].before = cpu_to_be32(blk2->blkno); - node->hdr.count = cpu_to_be16(2); + return error; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + btree[0].hashval = cpu_to_be32(blk1->hashval); + btree[0].before = cpu_to_be32(blk1->blkno); + btree[1].hashval = cpu_to_be32(blk2->hashval); + btree[1].before = cpu_to_be32(blk2->blkno); + nodehdr.count = 2; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); #ifdef DEBUG - if (be16_to_cpu(oldroot->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC) { - ASSERT(blk1->blkno >= mp->m_dirleafblk && - blk1->blkno < mp->m_dirfreeblk); - ASSERT(blk2->blkno >= mp->m_dirleafblk && - blk2->blkno < mp->m_dirfreeblk); + if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + ASSERT(blk1->blkno >= args->geo->leafblk && + blk1->blkno < args->geo->freeblk); + ASSERT(blk2->blkno >= args->geo->leafblk && + blk2->blkno < args->geo->freeblk); } #endif /* Header is already logged by xfs_da_node_create */ - xfs_da_log_buf(tp, bp, - XFS_DA_LOGRANGE(node, node->btree, - sizeof(xfs_da_node_entry_t) * 2)); - xfs_da_buf_done(bp); + xfs_trans_log_buf(tp, bp, + XFS_DA_LOGRANGE(node, btree, sizeof(xfs_da_node_entry_t) * 2)); - return(0); + return 0; } /* * Split the node, rebalance, then add the new entry. */ STATIC int /* error */ -xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk, - xfs_da_state_blk_t *addblk, - int treelevel, int *result) +xfs_da3_node_split( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk, + struct xfs_da_state_blk *addblk, + int treelevel, + int *result) { - xfs_da_intnode_t *node; - xfs_dablk_t blkno; - int newcount, error; - int useextra; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno; + int newcount; + int error; + int useextra; + struct xfs_inode *dp = state->args->dp; - node = oldblk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + trace_xfs_da_node_split(state->args); + + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); /* * With V2 dirs the extra block is data or freespace. @@ -398,7 +663,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, /* * Do we have to split the node? */ - if ((be16_to_cpu(node->hdr.count) + newcount) > state->node_ents) { + if (nodehdr.count + newcount > state->args->geo->node_ents) { /* * Allocate a new node, add to the doubly linked chain of * nodes, then move some of our excess entries into it. @@ -407,14 +672,14 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, if (error) return(error); /* GROT: dir is inconsistent */ - error = xfs_da_node_create(state->args, blkno, treelevel, + error = xfs_da3_node_create(state->args, blkno, treelevel, &newblk->bp, state->args->whichfork); if (error) return(error); /* GROT: dir is inconsistent */ newblk->blkno = blkno; newblk->magic = XFS_DA_NODE_MAGIC; - xfs_da_node_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + xfs_da3_node_rebalance(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) return(error); *result = 1; @@ -426,7 +691,7 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * Insert the new entry(s) into the correct block * (updating last hashval in the process). * - * xfs_da_node_add() inserts BEFORE the given index, + * xfs_da3_node_add() inserts BEFORE the given index, * and as a result of using node_lookup_int() we always * point to a valid entry (not after one), but a split * operation always results in a new block whose hashvals @@ -434,23 +699,24 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * * If we had double-split op below us, then add the extra block too. */ - node = oldblk->bp->data; - if (oldblk->index <= be16_to_cpu(node->hdr.count)) { + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (oldblk->index <= nodehdr.count) { oldblk->index++; - xfs_da_node_add(state, oldblk, addblk); + xfs_da3_node_add(state, oldblk, addblk); if (useextra) { if (state->extraafter) oldblk->index++; - xfs_da_node_add(state, oldblk, &state->extrablk); + xfs_da3_node_add(state, oldblk, &state->extrablk); state->extravalid = 0; } } else { newblk->index++; - xfs_da_node_add(state, newblk, addblk); + xfs_da3_node_add(state, newblk, addblk); if (useextra) { if (state->extraafter) newblk->index++; - xfs_da_node_add(state, newblk, &state->extrablk); + xfs_da3_node_add(state, newblk, &state->extrablk); state->extravalid = 0; } } @@ -465,31 +731,54 @@ xfs_da_node_split(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * NOTE: if blk2 is empty, then it will get the upper half of blk1. */ STATIC void -xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, - xfs_da_state_blk_t *blk2) +xfs_da3_node_rebalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *blk1, + struct xfs_da_state_blk *blk2) { - xfs_da_intnode_t *node1, *node2, *tmpnode; - xfs_da_node_entry_t *btree_s, *btree_d; - int count, tmp; - xfs_trans_t *tp; + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_intnode *tmpnode; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da_node_entry *btree_s; + struct xfs_da_node_entry *btree_d; + struct xfs_da3_icnode_hdr nodehdr1; + struct xfs_da3_icnode_hdr nodehdr2; + struct xfs_trans *tp; + int count; + int tmp; + int swap = 0; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_rebalance(state->args); + + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); - node1 = blk1->bp->data; - node2 = blk2->bp->data; /* * Figure out how many entries need to move, and in which direction. * Swap the nodes around if that makes it simpler. */ - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { + if (nodehdr1.count > 0 && nodehdr2.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[nodehdr2.count - 1].hashval) < + be32_to_cpu(btree1[nodehdr1.count - 1].hashval)))) { tmpnode = node1; node1 = node2; node2 = tmpnode; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + swap = 1; } - ASSERT(be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC); - ASSERT(be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC); - count = (be16_to_cpu(node1->hdr.count) - be16_to_cpu(node2->hdr.count)) / 2; + + count = (nodehdr1.count - nodehdr2.count) / 2; if (count == 0) return; tp = state->args->trans; @@ -500,10 +789,11 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, /* * Move elements in node2 up to make a hole. */ - if ((tmp = be16_to_cpu(node2->hdr.count)) > 0) { + tmp = nodehdr2.count; + if (tmp > 0) { tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node2->btree[count]; + btree_s = &btree2[0]; + btree_d = &btree2[count]; memmove(btree_d, btree_s, tmp); } @@ -511,12 +801,12 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Move the req'd B-tree elements from high in node1 to * low in node2. */ - be16_add(&node2->hdr.count, count); + nodehdr2.count += count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node1->btree[be16_to_cpu(node1->hdr.count) - count]; - btree_d = &node2->btree[0]; + btree_s = &btree1[nodehdr1.count - count]; + btree_d = &btree2[0]; memcpy(btree_d, btree_s, tmp); - be16_add(&node1->hdr.count, -count); + nodehdr1.count -= count; } else { /* * Move the req'd B-tree elements from low in node2 to @@ -524,49 +814,59 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, */ count = -count; tmp = count * (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[0]; - btree_d = &node1->btree[be16_to_cpu(node1->hdr.count)]; + btree_s = &btree2[0]; + btree_d = &btree1[nodehdr1.count]; memcpy(btree_d, btree_s, tmp); - be16_add(&node1->hdr.count, count); - xfs_da_log_buf(tp, blk1->bp, + nodehdr1.count += count; + + xfs_trans_log_buf(tp, blk1->bp, XFS_DA_LOGRANGE(node1, btree_d, tmp)); /* * Move elements in node2 down to fill the hole. */ - tmp = be16_to_cpu(node2->hdr.count) - count; + tmp = nodehdr2.count - count; tmp *= (uint)sizeof(xfs_da_node_entry_t); - btree_s = &node2->btree[count]; - btree_d = &node2->btree[0]; + btree_s = &btree2[count]; + btree_d = &btree2[0]; memmove(btree_d, btree_s, tmp); - be16_add(&node2->hdr.count, -count); + nodehdr2.count -= count; } /* * Log header of node 1 and all current bits of node 2. */ - xfs_da_log_buf(tp, blk1->bp, - XFS_DA_LOGRANGE(node1, &node1->hdr, sizeof(node1->hdr))); - xfs_da_log_buf(tp, blk2->bp, + dp->d_ops->node_hdr_to_disk(node1, &nodehdr1); + xfs_trans_log_buf(tp, blk1->bp, + XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size)); + + dp->d_ops->node_hdr_to_disk(node2, &nodehdr2); + xfs_trans_log_buf(tp, blk2->bp, XFS_DA_LOGRANGE(node2, &node2->hdr, - sizeof(node2->hdr) + - sizeof(node2->btree[0]) * be16_to_cpu(node2->hdr.count))); + dp->d_ops->node_hdr_size + + (sizeof(btree2[0]) * nodehdr2.count))); /* * Record the last hashval from each block for upward propagation. * (note: don't use the swapped node pointers) */ - node1 = blk1->bp->data; - node2 = blk2->bp->data; - blk1->hashval = be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval); - blk2->hashval = be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval); + if (swap) { + node1 = blk1->bp->b_addr; + node2 = blk2->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr1, node1); + dp->d_ops->node_hdr_from_disk(&nodehdr2, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + } + blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval); + blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval); /* * Adjust the expected index for insertion. */ - if (blk1->index >= be16_to_cpu(node1->hdr.count)) { - blk2->index = blk1->index - be16_to_cpu(node1->hdr.count); - blk1->index = be16_to_cpu(node1->hdr.count) + 1; /* make it invalid */ + if (blk1->index >= nodehdr1.count) { + blk2->index = blk1->index - nodehdr1.count; + blk1->index = nodehdr1.count + 1; /* make it invalid */ } } @@ -574,44 +874,52 @@ xfs_da_node_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, * Add a new entry to an intermediate node. */ STATIC void -xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, - xfs_da_state_blk_t *newblk) +xfs_da3_node_add( + struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; - xfs_mount_t *mp; + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int tmp; + struct xfs_inode *dp = state->args->dp; - node = oldblk->bp->data; - mp = state->mp; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - ASSERT((oldblk->index >= 0) && (oldblk->index <= be16_to_cpu(node->hdr.count))); + trace_xfs_da_node_add(state->args); + + node = oldblk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count); ASSERT(newblk->blkno != 0); if (state->args->whichfork == XFS_DATA_FORK) - ASSERT(newblk->blkno >= mp->m_dirleafblk && - newblk->blkno < mp->m_dirfreeblk); + ASSERT(newblk->blkno >= state->args->geo->leafblk && + newblk->blkno < state->args->geo->freeblk); /* * We may need to make some room before we insert the new node. */ tmp = 0; - btree = &node->btree[ oldblk->index ]; - if (oldblk->index < be16_to_cpu(node->hdr.count)) { - tmp = (be16_to_cpu(node->hdr.count) - oldblk->index) * (uint)sizeof(*btree); - memmove(btree + 1, btree, tmp); + if (oldblk->index < nodehdr.count) { + tmp = (nodehdr.count - oldblk->index) * (uint)sizeof(*btree); + memmove(&btree[oldblk->index + 1], &btree[oldblk->index], tmp); } - btree->hashval = cpu_to_be32(newblk->hashval); - btree->before = cpu_to_be32(newblk->blkno); - xfs_da_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, btree, tmp + sizeof(*btree))); - be16_add(&node->hdr.count, 1); - xfs_da_log_buf(state->args->trans, oldblk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + btree[oldblk->index].hashval = cpu_to_be32(newblk->hashval); + btree[oldblk->index].before = cpu_to_be32(newblk->blkno); + xfs_trans_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &btree[oldblk->index], + tmp + sizeof(*btree))); + + nodehdr.count += 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_trans_log_buf(state->args->trans, oldblk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); /* * Copy the last hash value from the oldblk to propagate upwards. */ - oldblk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1 ].hashval); + oldblk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); } /*======================================================================== @@ -623,12 +931,16 @@ xfs_da_node_add(xfs_da_state_t *state, xfs_da_state_blk_t *oldblk, * possibly deallocating that block, etc... */ int -xfs_da_join(xfs_da_state_t *state) +xfs_da3_join( + struct xfs_da_state *state) { - xfs_da_state_blk_t *drop_blk, *save_blk; - int action, error; + struct xfs_da_state_blk *drop_blk; + struct xfs_da_state_blk *save_blk; + int action = 0; + int error; + + trace_xfs_da_join(state->args); - action = 0; drop_blk = &state->path.blk[ state->path.active-1 ]; save_blk = &state->altpath.blk[ state->path.active-1 ]; ASSERT(state->path.blk[0].magic == XFS_DA_NODE_MAGIC); @@ -649,12 +961,12 @@ xfs_da_join(xfs_da_state_t *state) */ switch (drop_blk->magic) { case XFS_ATTR_LEAF_MAGIC: - error = xfs_attr_leaf_toosmall(state, &action); + error = xfs_attr3_leaf_toosmall(state, &action); if (error) return(error); if (action == 0) return(0); - xfs_attr_leaf_unbalance(state, drop_blk, save_blk); + xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); break; case XFS_DIR2_LEAFN_MAGIC: error = xfs_dir2_leafn_toosmall(state, &action); @@ -669,18 +981,18 @@ xfs_da_join(xfs_da_state_t *state) * Remove the offending node, fixup hashvals, * check for a toosmall neighbor. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_node_toosmall(state, &action); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_node_toosmall(state, &action); if (error) return(error); if (action == 0) return 0; - xfs_da_node_unbalance(state, drop_blk, save_blk); + xfs_da3_node_unbalance(state, drop_blk, save_blk); break; } - xfs_da_fixhashpath(state, &state->altpath); - error = xfs_da_blk_unlink(state, drop_blk, save_blk); + xfs_da3_fixhashpath(state, &state->altpath); + error = xfs_da3_blk_unlink(state, drop_blk, save_blk); xfs_da_state_kill_altpath(state); if (error) return(error); @@ -695,63 +1007,97 @@ xfs_da_join(xfs_da_state_t *state) * we only have one entry in the root, make the child block * the new root. */ - xfs_da_node_remove(state, drop_blk); - xfs_da_fixhashpath(state, &state->path); - error = xfs_da_root_join(state, &state->path.blk[0]); + xfs_da3_node_remove(state, drop_blk); + xfs_da3_fixhashpath(state, &state->path); + error = xfs_da3_root_join(state, &state->path.blk[0]); return(error); } +#ifdef DEBUG +static void +xfs_da_blkinfo_onlychild_validate(struct xfs_da_blkinfo *blkinfo, __u16 level) +{ + __be16 magic = blkinfo->magic; + + if (level == 1) { + ASSERT(magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + } else { + ASSERT(magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + } + ASSERT(!blkinfo->forw); + ASSERT(!blkinfo->back); +} +#else /* !DEBUG */ +#define xfs_da_blkinfo_onlychild_validate(blkinfo, level) +#endif /* !DEBUG */ + /* * We have only one entry in the root. Copy the only remaining child of * the old root to block 0 as the new root node. */ STATIC int -xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) +xfs_da3_root_join( + struct xfs_da_state *state, + struct xfs_da_state_blk *root_blk) { - xfs_da_intnode_t *oldroot; - /* REFERENCED */ - xfs_da_blkinfo_t *blkinfo; - xfs_da_args_t *args; - xfs_dablk_t child; - xfs_dabuf_t *bp; - int error; + struct xfs_da_intnode *oldroot; + struct xfs_da_args *args; + xfs_dablk_t child; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr oldroothdr; + struct xfs_da_node_entry *btree; + int error; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_root_join(state->args); - args = state->args; - ASSERT(args != NULL); ASSERT(root_blk->magic == XFS_DA_NODE_MAGIC); - oldroot = root_blk->bp->data; - ASSERT(be16_to_cpu(oldroot->hdr.info.magic) == XFS_DA_NODE_MAGIC); - ASSERT(!oldroot->hdr.info.forw); - ASSERT(!oldroot->hdr.info.back); + + args = state->args; + oldroot = root_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot); + ASSERT(oldroothdr.forw == 0); + ASSERT(oldroothdr.back == 0); /* * If the root has more than one child, then don't do anything. */ - if (be16_to_cpu(oldroot->hdr.count) > 1) - return(0); + if (oldroothdr.count > 1) + return 0; /* * Read in the (only) child block, then copy those bytes into * the root block's buffer and free the original child block. */ - child = be32_to_cpu(oldroot->btree[0].before); + btree = dp->d_ops->node_tree_p(oldroot); + child = be32_to_cpu(btree[0].before); ASSERT(child != 0); - error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, + error = xfs_da3_node_read(args->trans, dp, child, -1, &bp, args->whichfork); if (error) - return(error); - ASSERT(bp != NULL); - blkinfo = bp->data; - if (be16_to_cpu(oldroot->hdr.level) == 1) { - ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(blkinfo->magic) == XFS_ATTR_LEAF_MAGIC); - } else { - ASSERT(be16_to_cpu(blkinfo->magic) == XFS_DA_NODE_MAGIC); - } - ASSERT(!blkinfo->forw); - ASSERT(!blkinfo->back); - memcpy(root_blk->bp->data, bp->data, state->blocksize); - xfs_da_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); + return error; + xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level); + + /* + * This could be copying a leaf back into the root block in the case of + * there only being a single leaf block left in the tree. Hence we have + * to update the b_ops pointer as well to match the buffer type change + * that could occur. For dir3 blocks we also need to update the block + * number in the buffer header. + */ + memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize); + root_blk->bp->b_ops = bp->b_ops; + xfs_trans_buf_copy_type(root_blk->bp, bp); + if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { + struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; + da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + } + xfs_trans_log_buf(args->trans, root_blk->bp, 0, + args->geo->blksize - 1); error = xfs_da_shrink_inode(args, child, bp); return(error); } @@ -766,14 +1112,24 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) * If nothing can be done, return 0. */ STATIC int -xfs_da_node_toosmall(xfs_da_state_t *state, int *action) +xfs_da3_node_toosmall( + struct xfs_da_state *state, + int *action) { - xfs_da_intnode_t *node; - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - int count, forward, error, retval, i; - xfs_dablk_t blkno; - xfs_dabuf_t *bp; + struct xfs_da_intnode *node; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + xfs_dablk_t blkno; + struct xfs_buf *bp; + struct xfs_da3_icnode_hdr nodehdr; + int count; + int forward; + int error; + int retval; + int i; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_toosmall(state->args); /* * Check for the degenerate case of the block being over 50% full. @@ -781,11 +1137,10 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * to coalesce with a sibling. */ blk = &state->path.blk[ state->path.active-1 ]; - info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC); + info = blk->bp->b_addr; node = (xfs_da_intnode_t *)info; - count = be16_to_cpu(node->hdr.count); - if (count > (state->node_ents >> 1)) { + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (nodehdr.count > (state->args->geo->node_ents >> 1)) { *action = 0; /* blk over 50%, don't try to join */ return(0); /* blk over 50%, don't try to join */ } @@ -796,14 +1151,14 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ - if (count == 0) { + if (nodehdr.count == 0) { /* * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ forward = (info->forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); if (error) return(error); @@ -822,35 +1177,35 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ + count = state->args->geo->node_ents; + count -= state->args->geo->node_ents >> 2; + count -= nodehdr.count; + /* start with smaller blk num */ - forward = (be32_to_cpu(info->forw) < be32_to_cpu(info->back)); + forward = nodehdr.forw < nodehdr.back; for (i = 0; i < 2; forward = !forward, i++) { + struct xfs_da3_icnode_hdr thdr; if (forward) - blkno = be32_to_cpu(info->forw); + blkno = nodehdr.forw; else - blkno = be32_to_cpu(info->back); + blkno = nodehdr.back; if (blkno == 0) continue; - error = xfs_da_read_buf(state->args->trans, state->args->dp, + error = xfs_da3_node_read(state->args->trans, dp, blkno, -1, &bp, state->args->whichfork); if (error) return(error); - ASSERT(bp != NULL); - - node = (xfs_da_intnode_t *)info; - count = state->node_ents; - count -= state->node_ents >> 2; - count -= be16_to_cpu(node->hdr.count); - node = bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - count -= be16_to_cpu(node->hdr.count); - xfs_da_brelse(state->args->trans, bp); - if (count >= 0) + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&thdr, node); + xfs_trans_brelse(state->args->trans, bp); + + if (count - thdr.count >= 0) break; /* fits with at least 25% to spare */ } if (i >= 2) { *action = 0; - return(0); + return 0; } /* @@ -859,28 +1214,43 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) { - error = xfs_da_path_shift(state, &state->altpath, forward, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } } else { - error = xfs_da_path_shift(state, &state->path, forward, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &retval); - if (error) { - return(error); - } - if (retval) { - *action = 0; - return(0); - } + } + if (error) + return error; + if (retval) { + *action = 0; + return 0; } *action = 1; - return(0); + return 0; +} + +/* + * Pick up the last hashvalue from an intermediate node. + */ +STATIC uint +xfs_da3_node_lasthash( + struct xfs_inode *dp, + struct xfs_buf *bp, + int *count) +{ + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + + node = bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + if (count) + *count = nodehdr.count; + if (!nodehdr.count) + return 0; + btree = dp->d_ops->node_tree_p(node); + return be32_to_cpu(btree[nodehdr.count - 1].hashval); } /* @@ -888,13 +1258,19 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) * when we stop making changes, return. */ void -xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) +xfs_da3_fixhashpath( + struct xfs_da_state *state, + struct xfs_da_state_path *path) { - xfs_da_state_blk_t *blk; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dahash_t lasthash=0; - int level, count; + struct xfs_da_state_blk *blk; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + xfs_dahash_t lasthash=0; + int level; + int count; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_fixhashpath(state->args); level = path->active-1; blk = &path->blk[ level ]; @@ -905,28 +1281,31 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) return; break; case XFS_DIR2_LEAFN_MAGIC: - lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count); + lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count); if (count == 0) return; break; case XFS_DA_NODE_MAGIC: - lasthash = xfs_da_node_lasthash(blk->bp, &count); + lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count); if (count == 0) return; break; } for (blk--, level--; level >= 0; blk--, level--) { - node = blk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - btree = &node->btree[ blk->index ]; - if (be32_to_cpu(btree->hashval) == lasthash) + struct xfs_da3_icnode_hdr nodehdr; + + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + if (be32_to_cpu(btree[blk->index].hashval) == lasthash) break; blk->hashval = lasthash; - btree->hashval = cpu_to_be32(lasthash); - xfs_da_log_buf(state->args->trans, blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); + btree[blk->index].hashval = cpu_to_be32(lasthash); + xfs_trans_log_buf(state->args->trans, blk->bp, + XFS_DA_LOGRANGE(node, &btree[blk->index], + sizeof(*btree))); - lasthash = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + lasthash = be32_to_cpu(btree[nodehdr.count - 1].hashval); } } @@ -934,100 +1313,122 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) * Remove an entry from an intermediate node. */ STATIC void -xfs_da_node_remove(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk) +xfs_da3_node_remove( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk) { - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - int tmp; - - node = drop_blk->bp->data; - ASSERT(drop_blk->index < be16_to_cpu(node->hdr.count)); + struct xfs_da_intnode *node; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_node_entry *btree; + int index; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_remove(state->args); + + node = drop_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + ASSERT(drop_blk->index < nodehdr.count); ASSERT(drop_blk->index >= 0); /* * Copy over the offending entry, or just zero it out. */ - btree = &node->btree[drop_blk->index]; - if (drop_blk->index < (be16_to_cpu(node->hdr.count)-1)) { - tmp = be16_to_cpu(node->hdr.count) - drop_blk->index - 1; + index = drop_blk->index; + btree = dp->d_ops->node_tree_p(node); + if (index < nodehdr.count - 1) { + tmp = nodehdr.count - index - 1; tmp *= (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, btree + 1, tmp); - xfs_da_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, tmp)); - btree = &node->btree[be16_to_cpu(node->hdr.count)-1]; - } - memset((char *)btree, 0, sizeof(xfs_da_node_entry_t)); - xfs_da_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, btree, sizeof(*btree))); - be16_add(&node->hdr.count, -1); - xfs_da_log_buf(state->args->trans, drop_blk->bp, - XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); + memmove(&btree[index], &btree[index + 1], tmp); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &btree[index], tmp)); + index = nodehdr.count - 1; + } + memset(&btree[index], 0, sizeof(xfs_da_node_entry_t)); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index]))); + nodehdr.count -= 1; + dp->d_ops->node_hdr_to_disk(node, &nodehdr); + xfs_trans_log_buf(state->args->trans, drop_blk->bp, + XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); /* * Copy the last hash value from the block to propagate upwards. */ - btree--; - drop_blk->hashval = be32_to_cpu(btree->hashval); + drop_blk->hashval = be32_to_cpu(btree[index - 1].hashval); } /* - * Unbalance the btree elements between two intermediate nodes, + * Unbalance the elements between two intermediate nodes, * move all Btree elements from one node into another. */ STATIC void -xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_node_unbalance( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_intnode_t *drop_node, *save_node; - xfs_da_node_entry_t *btree; - int tmp; - xfs_trans_t *tp; - - drop_node = drop_blk->bp->data; - save_node = save_blk->bp->data; - ASSERT(be16_to_cpu(drop_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - ASSERT(be16_to_cpu(save_node->hdr.info.magic) == XFS_DA_NODE_MAGIC); + struct xfs_da_intnode *drop_node; + struct xfs_da_intnode *save_node; + struct xfs_da_node_entry *drop_btree; + struct xfs_da_node_entry *save_btree; + struct xfs_da3_icnode_hdr drop_hdr; + struct xfs_da3_icnode_hdr save_hdr; + struct xfs_trans *tp; + int sindex; + int tmp; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_node_unbalance(state->args); + + drop_node = drop_blk->bp->b_addr; + save_node = save_blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node); + dp->d_ops->node_hdr_from_disk(&save_hdr, save_node); + drop_btree = dp->d_ops->node_tree_p(drop_node); + save_btree = dp->d_ops->node_tree_p(save_node); tp = state->args->trans; /* * If the dying block has lower hashvals, then move all the * elements in the remaining block up to make a hole. */ - if ((be32_to_cpu(drop_node->btree[0].hashval) < be32_to_cpu(save_node->btree[ 0 ].hashval)) || - (be32_to_cpu(drop_node->btree[be16_to_cpu(drop_node->hdr.count)-1].hashval) < - be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval))) - { - btree = &save_node->btree[be16_to_cpu(drop_node->hdr.count)]; - tmp = be16_to_cpu(save_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memmove(btree, &save_node->btree[0], tmp); - btree = &save_node->btree[0]; - xfs_da_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - (be16_to_cpu(save_node->hdr.count) + be16_to_cpu(drop_node->hdr.count)) * - sizeof(xfs_da_node_entry_t))); + if ((be32_to_cpu(drop_btree[0].hashval) < + be32_to_cpu(save_btree[0].hashval)) || + (be32_to_cpu(drop_btree[drop_hdr.count - 1].hashval) < + be32_to_cpu(save_btree[save_hdr.count - 1].hashval))) { + /* XXX: check this - is memmove dst correct? */ + tmp = save_hdr.count * sizeof(xfs_da_node_entry_t); + memmove(&save_btree[drop_hdr.count], &save_btree[0], tmp); + + sindex = 0; + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_btree[0], + (save_hdr.count + drop_hdr.count) * + sizeof(xfs_da_node_entry_t))); } else { - btree = &save_node->btree[be16_to_cpu(save_node->hdr.count)]; - xfs_da_log_buf(tp, save_blk->bp, - XFS_DA_LOGRANGE(save_node, btree, - be16_to_cpu(drop_node->hdr.count) * - sizeof(xfs_da_node_entry_t))); + sindex = save_hdr.count; + xfs_trans_log_buf(tp, save_blk->bp, + XFS_DA_LOGRANGE(save_node, &save_btree[sindex], + drop_hdr.count * sizeof(xfs_da_node_entry_t))); } /* * Move all the B-tree elements from drop_blk to save_blk. */ - tmp = be16_to_cpu(drop_node->hdr.count) * (uint)sizeof(xfs_da_node_entry_t); - memcpy(btree, &drop_node->btree[0], tmp); - be16_add(&save_node->hdr.count, be16_to_cpu(drop_node->hdr.count)); + tmp = drop_hdr.count * (uint)sizeof(xfs_da_node_entry_t); + memcpy(&save_btree[sindex], &drop_btree[0], tmp); + save_hdr.count += drop_hdr.count; - xfs_da_log_buf(tp, save_blk->bp, + dp->d_ops->node_hdr_to_disk(save_node, &save_hdr); + xfs_trans_log_buf(tp, save_blk->bp, XFS_DA_LOGRANGE(save_node, &save_node->hdr, - sizeof(save_node->hdr))); + dp->d_ops->node_hdr_size)); /* * Save the last hashval in the remaining block for upward propagation. */ - save_blk->hashval = be32_to_cpu(save_node->btree[be16_to_cpu(save_node->hdr.count)-1].hashval); + save_blk->hashval = be32_to_cpu(save_btree[save_hdr.count - 1].hashval); } /*======================================================================== @@ -1046,16 +1447,25 @@ xfs_da_node_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * pruned depth-first tree search. */ int /* error */ -xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) +xfs_da3_node_lookup_int( + struct xfs_da_state *state, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *curr; - xfs_da_intnode_t *node; - xfs_da_node_entry_t *btree; - xfs_dablk_t blkno; - int probe, span, max, error, retval; - xfs_dahash_t hashval; - xfs_da_args_t *args; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *curr; + struct xfs_da_intnode *node; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da_args *args; + xfs_dablk_t blkno; + xfs_dahash_t hashval; + xfs_dahash_t btreehashval; + int probe; + int span; + int max; + int error; + int retval; + struct xfs_inode *dp = state->args->dp; args = state->args; @@ -1063,7 +1473,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Descend thru the B-tree searching each level for the right * node to use, until the right hashval is found. */ - blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0; + blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0; for (blk = &state->path.blk[0], state->path.active = 1; state->path.active <= XFS_DA_NODE_MAXDEPTH; blk++, state->path.active++) { @@ -1071,74 +1481,85 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) * Read the next node down in the tree. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, + error = xfs_da3_node_read(args->trans, args->dp, blkno, -1, &blk->bp, args->whichfork); if (error) { blk->blkno = 0; state->path.active--; return(error); } - curr = blk->bp->data; - ASSERT(be16_to_cpu(curr->magic) == XFS_DA_NODE_MAGIC || - be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC); + curr = blk->bp->b_addr; + blk->magic = be16_to_cpu(curr->magic); + + if (blk->magic == XFS_ATTR_LEAF_MAGIC || + blk->magic == XFS_ATTR3_LEAF_MAGIC) { + blk->magic = XFS_ATTR_LEAF_MAGIC; + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + } + + if (blk->magic == XFS_DIR2_LEAFN_MAGIC || + blk->magic == XFS_DIR3_LEAFN_MAGIC) { + blk->magic = XFS_DIR2_LEAFN_MAGIC; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + } + + blk->magic = XFS_DA_NODE_MAGIC; + /* * Search an intermediate node for a match. */ - blk->magic = be16_to_cpu(curr->magic); - if (blk->magic == XFS_DA_NODE_MAGIC) { - node = blk->bp->data; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); - /* - * Binary search. (note: small blocks will skip loop) - */ - max = be16_to_cpu(node->hdr.count); - probe = span = max / 2; - hashval = args->hashval; - for (btree = &node->btree[probe]; span > 4; - btree = &node->btree[probe]) { - span /= 2; - if (be32_to_cpu(btree->hashval) < hashval) - probe += span; - else if (be32_to_cpu(btree->hashval) > hashval) - probe -= span; - else - break; - } - ASSERT((probe >= 0) && (probe < max)); - ASSERT((span <= 4) || (be32_to_cpu(btree->hashval) == hashval)); + max = nodehdr.count; + blk->hashval = be32_to_cpu(btree[max - 1].hashval); - /* - * Since we may have duplicate hashval's, find the first - * matching hashval in the node. - */ - while ((probe > 0) && (be32_to_cpu(btree->hashval) >= hashval)) { - btree--; - probe--; - } - while ((probe < max) && (be32_to_cpu(btree->hashval) < hashval)) { - btree++; - probe++; - } + /* + * Binary search. (note: small blocks will skip loop) + */ + probe = span = max / 2; + hashval = args->hashval; + while (span > 4) { + span /= 2; + btreehashval = be32_to_cpu(btree[probe].hashval); + if (btreehashval < hashval) + probe += span; + else if (btreehashval > hashval) + probe -= span; + else + break; + } + ASSERT((probe >= 0) && (probe < max)); + ASSERT((span <= 4) || + (be32_to_cpu(btree[probe].hashval) == hashval)); - /* - * Pick the right block to descend on. - */ - if (probe == max) { - blk->index = max-1; - blkno = be32_to_cpu(node->btree[max-1].before); - } else { - blk->index = probe; - blkno = be32_to_cpu(btree->before); - } - } else if (be16_to_cpu(curr->magic) == XFS_ATTR_LEAF_MAGIC) { - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); - break; - } else if (be16_to_cpu(curr->magic) == XFS_DIR2_LEAFN_MAGIC) { - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL); - break; + /* + * Since we may have duplicate hashval's, find the first + * matching hashval in the node. + */ + while (probe > 0 && + be32_to_cpu(btree[probe].hashval) >= hashval) { + probe--; + } + while (probe < max && + be32_to_cpu(btree[probe].hashval) < hashval) { + probe++; + } + + /* + * Pick the right block to descend on. + */ + if (probe == max) { + blk->index = max - 1; + blkno = be32_to_cpu(btree[max - 1].before); + } else { + blk->index = probe; + blkno = be32_to_cpu(btree[probe].before); } } @@ -1152,22 +1573,23 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) if (blk->magic == XFS_DIR2_LEAFN_MAGIC) { retval = xfs_dir2_leafn_lookup_int(blk->bp, args, &blk->index, state); - } - else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { - retval = xfs_attr_leaf_lookup_int(blk->bp, args); + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + retval = xfs_attr3_leaf_lookup_int(blk->bp, args); blk->index = args->index; args->blkno = blk->blkno; + } else { + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); } if (((retval == ENOENT) || (retval == ENOATTR)) && (blk->hashval == args->hashval)) { - error = xfs_da_path_shift(state, &state->path, 1, 1, + error = xfs_da3_path_shift(state, &state->path, 1, 1, &retval); if (error) return(error); if (retval == 0) { continue; - } - else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { + } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { /* path_shift() gives ENOENT */ retval = XFS_ERROR(ENOATTR); } @@ -1183,40 +1605,75 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) *========================================================================*/ /* + * Compare two intermediate nodes for "order". + */ +STATIC int +xfs_da3_node_order( + struct xfs_inode *dp, + struct xfs_buf *node1_bp, + struct xfs_buf *node2_bp) +{ + struct xfs_da_intnode *node1; + struct xfs_da_intnode *node2; + struct xfs_da_node_entry *btree1; + struct xfs_da_node_entry *btree2; + struct xfs_da3_icnode_hdr node1hdr; + struct xfs_da3_icnode_hdr node2hdr; + + node1 = node1_bp->b_addr; + node2 = node2_bp->b_addr; + dp->d_ops->node_hdr_from_disk(&node1hdr, node1); + dp->d_ops->node_hdr_from_disk(&node2hdr, node2); + btree1 = dp->d_ops->node_tree_p(node1); + btree2 = dp->d_ops->node_tree_p(node2); + + if (node1hdr.count > 0 && node2hdr.count > 0 && + ((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) || + (be32_to_cpu(btree2[node2hdr.count - 1].hashval) < + be32_to_cpu(btree1[node1hdr.count - 1].hashval)))) { + return 1; + } + return 0; +} + +/* * Link a new block into a doubly linked list of blocks (of whatever type). */ int /* error */ -xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, - xfs_da_state_blk_t *new_blk) +xfs_da3_blk_link( + struct xfs_da_state *state, + struct xfs_da_state_blk *old_blk, + struct xfs_da_state_blk *new_blk) { - xfs_da_blkinfo_t *old_info, *new_info, *tmp_info; - xfs_da_args_t *args; - int before=0, error; - xfs_dabuf_t *bp; + struct xfs_da_blkinfo *old_info; + struct xfs_da_blkinfo *new_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int before = 0; + int error; + struct xfs_inode *dp = state->args->dp; /* * Set up environment. */ args = state->args; ASSERT(args != NULL); - old_info = old_blk->bp->data; - new_info = new_blk->bp->data; + old_info = old_blk->bp->b_addr; + new_info = new_blk->bp->b_addr; ASSERT(old_blk->magic == XFS_DA_NODE_MAGIC || old_blk->magic == XFS_DIR2_LEAFN_MAGIC || old_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(old_blk->magic == be16_to_cpu(old_info->magic)); - ASSERT(new_blk->magic == be16_to_cpu(new_info->magic)); - ASSERT(old_blk->magic == new_blk->magic); switch (old_blk->magic) { case XFS_ATTR_LEAF_MAGIC: before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp); break; case XFS_DIR2_LEAFN_MAGIC: - before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp); + before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp); break; case XFS_DA_NODE_MAGIC: - before = xfs_da_node_order(old_blk->bp, new_blk->bp); + before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp); break; } @@ -1227,114 +1684,77 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, /* * Link new block in before existing block. */ + trace_xfs_da_link_before(args); new_info->forw = cpu_to_be32(old_blk->blkno); new_info->back = old_info->back; if (old_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; - ASSERT(be16_to_cpu(tmp_info->magic) == be16_to_cpu(old_info->magic)); + tmp_info = bp->b_addr; + ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == old_blk->blkno); tmp_info->forw = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->back = cpu_to_be32(new_blk->blkno); } else { /* * Link new block in after existing block. */ + trace_xfs_da_link_after(args); new_info->forw = old_info->forw; new_info->back = cpu_to_be32(old_blk->blkno); if (old_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, dp, be32_to_cpu(old_info->forw), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == old_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == old_blk->blkno); tmp_info->back = cpu_to_be32(new_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); - xfs_da_buf_done(bp); + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info)-1); } old_info->forw = cpu_to_be32(new_blk->blkno); } - xfs_da_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); - xfs_da_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); - return(0); -} - -/* - * Compare two intermediate nodes for "order". - */ -STATIC int -xfs_da_node_order(xfs_dabuf_t *node1_bp, xfs_dabuf_t *node2_bp) -{ - xfs_da_intnode_t *node1, *node2; - - node1 = node1_bp->data; - node2 = node2_bp->data; - ASSERT((be16_to_cpu(node1->hdr.info.magic) == XFS_DA_NODE_MAGIC) && - (be16_to_cpu(node2->hdr.info.magic) == XFS_DA_NODE_MAGIC)); - if ((be16_to_cpu(node1->hdr.count) > 0) && (be16_to_cpu(node2->hdr.count) > 0) && - ((be32_to_cpu(node2->btree[0].hashval) < - be32_to_cpu(node1->btree[0].hashval)) || - (be32_to_cpu(node2->btree[be16_to_cpu(node2->hdr.count)-1].hashval) < - be32_to_cpu(node1->btree[be16_to_cpu(node1->hdr.count)-1].hashval)))) { - return(1); - } + xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); + xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); return(0); } /* - * Pick up the last hashvalue from an intermediate node. - */ -STATIC uint -xfs_da_node_lasthash(xfs_dabuf_t *bp, int *count) -{ - xfs_da_intnode_t *node; - - node = bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - if (count) - *count = be16_to_cpu(node->hdr.count); - if (!node->hdr.count) - return(0); - return be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); -} - -/* * Unlink a block from a doubly linked list of blocks. */ STATIC int /* error */ -xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, - xfs_da_state_blk_t *save_blk) +xfs_da3_blk_unlink( + struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk) { - xfs_da_blkinfo_t *drop_info, *save_info, *tmp_info; - xfs_da_args_t *args; - xfs_dabuf_t *bp; - int error; + struct xfs_da_blkinfo *drop_info; + struct xfs_da_blkinfo *save_info; + struct xfs_da_blkinfo *tmp_info; + struct xfs_da_args *args; + struct xfs_buf *bp; + int error; /* * Set up environment. */ args = state->args; ASSERT(args != NULL); - save_info = save_blk->bp->data; - drop_info = drop_blk->bp->data; + save_info = save_blk->bp->b_addr; + drop_info = drop_blk->bp->b_addr; ASSERT(save_blk->magic == XFS_DA_NODE_MAGIC || save_blk->magic == XFS_DIR2_LEAFN_MAGIC || save_blk->magic == XFS_ATTR_LEAF_MAGIC); - ASSERT(save_blk->magic == be16_to_cpu(save_info->magic)); - ASSERT(drop_blk->magic == be16_to_cpu(drop_info->magic)); ASSERT(save_blk->magic == drop_blk->magic); ASSERT((be32_to_cpu(save_info->forw) == drop_blk->blkno) || (be32_to_cpu(save_info->back) == drop_blk->blkno)); @@ -1345,42 +1765,42 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * Unlink the leaf block from the doubly linked chain of leaves. */ if (be32_to_cpu(save_info->back) == drop_blk->blkno) { + trace_xfs_da_unlink_back(args); save_info->back = drop_info->back; if (drop_info->back) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->back), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->forw) == drop_blk->blkno); tmp_info->forw = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } else { + trace_xfs_da_unlink_forward(args); save_info->forw = drop_info->forw; if (drop_info->forw) { - error = xfs_da_read_buf(args->trans, args->dp, + error = xfs_da3_node_read(args->trans, args->dp, be32_to_cpu(drop_info->forw), -1, &bp, args->whichfork); if (error) return(error); ASSERT(bp != NULL); - tmp_info = bp->data; + tmp_info = bp->b_addr; ASSERT(tmp_info->magic == save_info->magic); ASSERT(be32_to_cpu(tmp_info->back) == drop_blk->blkno); tmp_info->back = cpu_to_be32(save_blk->blkno); - xfs_da_log_buf(args->trans, bp, 0, + xfs_trans_log_buf(args->trans, bp, 0, sizeof(*tmp_info) - 1); - xfs_da_buf_done(bp); } } - xfs_da_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); + xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); return(0); } @@ -1393,15 +1813,25 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, * the new bottom and the root. */ int /* error */ -xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, - int forward, int release, int *result) +xfs_da3_path_shift( + struct xfs_da_state *state, + struct xfs_da_state_path *path, + int forward, + int release, + int *result) { - xfs_da_state_blk_t *blk; - xfs_da_blkinfo_t *info; - xfs_da_intnode_t *node; - xfs_da_args_t *args; - xfs_dablk_t blkno=0; - int level, error; + struct xfs_da_state_blk *blk; + struct xfs_da_blkinfo *info; + struct xfs_da_intnode *node; + struct xfs_da_args *args; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr nodehdr; + xfs_dablk_t blkno = 0; + int level; + int error; + struct xfs_inode *dp = state->args->dp; + + trace_xfs_da_path_shift(state->args); /* * Roll up the Btree looking for the first block where our @@ -1414,22 +1844,23 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, ASSERT((path->active > 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); level = (path->active-1) - 1; /* skip bottom layer in path */ for (blk = &path->blk[level]; level >= 0; blk--, level--) { - ASSERT(blk->bp != NULL); - node = blk->bp->data; - ASSERT(be16_to_cpu(node->hdr.info.magic) == XFS_DA_NODE_MAGIC); - if (forward && (blk->index < be16_to_cpu(node->hdr.count)-1)) { + node = blk->bp->b_addr; + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + + if (forward && (blk->index < nodehdr.count - 1)) { blk->index++; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } else if (!forward && (blk->index > 0)) { blk->index--; - blkno = be32_to_cpu(node->btree[blk->index].before); + blkno = be32_to_cpu(btree[blk->index].before); break; } } if (level < 0) { *result = XFS_ERROR(ENOENT); /* we're out of our tree */ - ASSERT(args->oknoent); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); return(0); } @@ -1443,51 +1874,65 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * (if it's dirty, trans won't actually let go) */ if (release) - xfs_da_brelse(args->trans, blk->bp); + xfs_trans_brelse(args->trans, blk->bp); /* * Read the next child block. */ blk->blkno = blkno; - error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, - &blk->bp, args->whichfork); + error = xfs_da3_node_read(args->trans, dp, blkno, -1, + &blk->bp, args->whichfork); if (error) return(error); - ASSERT(blk->bp != NULL); - info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_DA_NODE_MAGIC || - be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC || - be16_to_cpu(info->magic) == XFS_ATTR_LEAF_MAGIC); - blk->magic = be16_to_cpu(info->magic); - if (blk->magic == XFS_DA_NODE_MAGIC) { + info = blk->bp->b_addr; + ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || + info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC) || + info->magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)); + + + /* + * Note: we flatten the magic number to a single type so we + * don't have to compare against crc/non-crc types elsewhere. + */ + switch (be16_to_cpu(info->magic)) { + case XFS_DA_NODE_MAGIC: + case XFS_DA3_NODE_MAGIC: + blk->magic = XFS_DA_NODE_MAGIC; node = (xfs_da_intnode_t *)info; - blk->hashval = be32_to_cpu(node->btree[be16_to_cpu(node->hdr.count)-1].hashval); + dp->d_ops->node_hdr_from_disk(&nodehdr, node); + btree = dp->d_ops->node_tree_p(node); + blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval); if (forward) blk->index = 0; else - blk->index = be16_to_cpu(node->hdr.count)-1; - blkno = be32_to_cpu(node->btree[blk->index].before); - } else { + blk->index = nodehdr.count - 1; + blkno = be32_to_cpu(btree[blk->index].before); + break; + case XFS_ATTR_LEAF_MAGIC: + case XFS_ATTR3_LEAF_MAGIC: + blk->magic = XFS_ATTR_LEAF_MAGIC; ASSERT(level == path->active-1); blk->index = 0; - switch(blk->magic) { - case XFS_ATTR_LEAF_MAGIC: - blk->hashval = xfs_attr_leaf_lasthash(blk->bp, - NULL); - break; - case XFS_DIR2_LEAFN_MAGIC: - blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, - NULL); - break; - default: - ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC || - blk->magic == XFS_DIR2_LEAFN_MAGIC); - break; - } + blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL); + break; + case XFS_DIR2_LEAFN_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + blk->magic = XFS_DIR2_LEAFN_MAGIC; + ASSERT(level == path->active-1); + blk->index = 0; + blk->hashval = xfs_dir2_leafn_lasthash(args->dp, + blk->bp, NULL); + break; + default: + ASSERT(0); + break; } } *result = 0; - return(0); + return 0; } @@ -1501,7 +1946,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, * This is implemented with some source-level loop unrolling. */ xfs_dahash_t -xfs_da_hashname(const uchar_t *name, int namelen) +xfs_da_hashname(const __uint8_t *name, int namelen) { xfs_dahash_t hash; @@ -1528,77 +1973,82 @@ xfs_da_hashname(const uchar_t *name, int namelen) } } -/* - * Add a block to the btree ahead of the file. - * Return the new block number to the caller. - */ +enum xfs_dacmp +xfs_da_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + return (args->namelen == len && memcmp(args->name, name, len) == 0) ? + XFS_CMP_EXACT : XFS_CMP_DIFFERENT; +} + +static xfs_dahash_t +xfs_default_hashname( + struct xfs_name *name) +{ + return xfs_da_hashname(name->name, name->len); +} + +const struct xfs_nameops xfs_default_nameops = { + .hashname = xfs_default_hashname, + .compname = xfs_da_compname +}; + int -xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) +xfs_da_grow_inode_int( + struct xfs_da_args *args, + xfs_fileoff_t *bno, + int count) { - xfs_fileoff_t bno, b; - xfs_bmbt_irec_t map; - xfs_bmbt_irec_t *mapp; - xfs_inode_t *dp; - int nmap, error, w, count, c, got, i, mapi; - xfs_trans_t *tp; - xfs_mount_t *mp; + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + int w = args->whichfork; + xfs_drfsbno_t nblks = dp->i_d.di_nblocks; + struct xfs_bmbt_irec map, *mapp; + int nmap, error, got, i, mapi; - dp = args->dp; - mp = dp->i_mount; - w = args->whichfork; - tp = args->trans; - /* - * For new directories adjust the file offset and block count. - */ - if (w == XFS_DATA_FORK) { - bno = mp->m_dirleafblk; - count = mp->m_dirblkfsbs; - } else { - bno = 0; - count = 1; - } /* * Find a spot in the file space to put the new block. */ - if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, w))) + error = xfs_bmap_first_unused(tp, dp, count, bno, w); + if (error) return error; - if (w == XFS_DATA_FORK) - ASSERT(bno >= mp->m_dirleafblk && bno < mp->m_dirfreeblk); + /* * Try mapping it in one filesystem block. */ nmap = 1; ASSERT(args->firstblock != NULL); - if ((error = xfs_bmapi(tp, dp, bno, count, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE|XFS_BMAPI_METADATA| - XFS_BMAPI_CONTIG, + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, args->firstblock, args->total, &map, &nmap, - args->flist, NULL))) { + args->flist); + if (error) return error; - } + ASSERT(nmap <= 1); if (nmap == 1) { mapp = ↦ mapi = 1; - } - /* - * If we didn't get it and the block might work if fragmented, - * try without the CONTIG flag. Loop until we get it all. - */ - else if (nmap == 0 && count > 1) { + } else if (nmap == 0 && count > 1) { + xfs_fileoff_t b; + int c; + + /* + * If we didn't get it and the block might work if fragmented, + * try without the CONTIG flag. Loop until we get it all. + */ mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); - for (b = bno, mapi = 0; b < bno + count; ) { + for (b = *bno, mapi = 0; b < *bno + count; ) { nmap = MIN(XFS_BMAP_MAX_NMAP, count); - c = (int)(bno + count - b); - if ((error = xfs_bmapi(tp, dp, b, c, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_WRITE| - XFS_BMAPI_METADATA, + c = (int)(*bno + count - b); + error = xfs_bmapi_write(tp, dp, b, c, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist, - NULL))) { - kmem_free(mapp, sizeof(*mapp) * count); - return error; - } + &mapp[mapi], &nmap, args->flist); + if (error) + goto out_free_map; if (nmap < 1) break; mapi += nmap; @@ -1609,22 +2059,47 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) mapi = 0; mapp = NULL; } + /* * Count the blocks we got, make sure it matches the total. */ for (i = 0, got = 0; i < mapi; i++) got += mapp[i].br_blockcount; - if (got != count || mapp[0].br_startoff != bno || + if (got != count || mapp[0].br_startoff != *bno || mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != - bno + count) { - if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); - return XFS_ERROR(ENOSPC); + *bno + count) { + error = XFS_ERROR(ENOSPC); + goto out_free_map; } + + /* account for newly allocated blocks in reserved blocks total */ + args->total -= dp->i_d.di_nblocks - nblks; + +out_free_map: if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); - *new_blkno = (xfs_dablk_t)bno; - return 0; + kmem_free(mapp); + return error; +} + +/* + * Add a block to the btree ahead of the file. + * Return the new block number to the caller. + */ +int +xfs_da_grow_inode( + struct xfs_da_args *args, + xfs_dablk_t *new_blkno) +{ + xfs_fileoff_t bno; + int error; + + trace_xfs_da_grow_inode(args); + + bno = args->geo->leafblk; + error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount); + if (!error) + *new_blkno = (xfs_dablk_t)bno; + return error; } /* @@ -1636,30 +2111,48 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno) * a bmap btree split to do that. */ STATIC int -xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, - xfs_dabuf_t **dead_bufp) +xfs_da3_swap_lastblock( + struct xfs_da_args *args, + xfs_dablk_t *dead_blknop, + struct xfs_buf **dead_bufp) { - xfs_dablk_t dead_blkno, last_blkno, sib_blkno, par_blkno; - xfs_dabuf_t *dead_buf, *last_buf, *sib_buf, *par_buf; - xfs_fileoff_t lastoff; - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error, w, entno, level, dead_level; - xfs_da_blkinfo_t *dead_info, *sib_info; - xfs_da_intnode_t *par_node, *dead_node; - xfs_dir2_leaf_t *dead_leaf2; - xfs_dahash_t dead_hash; + struct xfs_da_blkinfo *dead_info; + struct xfs_da_blkinfo *sib_info; + struct xfs_da_intnode *par_node; + struct xfs_da_intnode *dead_node; + struct xfs_dir2_leaf *dead_leaf2; + struct xfs_da_node_entry *btree; + struct xfs_da3_icnode_hdr par_hdr; + struct xfs_inode *dp; + struct xfs_trans *tp; + struct xfs_mount *mp; + struct xfs_buf *dead_buf; + struct xfs_buf *last_buf; + struct xfs_buf *sib_buf; + struct xfs_buf *par_buf; + xfs_dahash_t dead_hash; + xfs_fileoff_t lastoff; + xfs_dablk_t dead_blkno; + xfs_dablk_t last_blkno; + xfs_dablk_t sib_blkno; + xfs_dablk_t par_blkno; + int error; + int w; + int entno; + int level; + int dead_level; + + trace_xfs_da_swap_lastblock(args); dead_buf = *dead_bufp; dead_blkno = *dead_blknop; tp = args->trans; - ip = args->dp; + dp = args->dp; w = args->whichfork; ASSERT(w == XFS_DATA_FORK); - mp = ip->i_mount; - lastoff = mp->m_dirfreeblk; - error = xfs_bmap_last_before(tp, ip, &lastoff, w); + mp = dp->i_mount; + lastoff = args->geo->freeblk; + error = xfs_bmap_last_before(tp, dp, &lastoff, w); if (error) return error; if (unlikely(lastoff == 0)) { @@ -1670,36 +2163,47 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, /* * Read the last block in the btree space. */ - last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; - if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) + last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount; + error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w); + if (error) return error; /* * Copy the last block into the dead buffer and log it. */ - memcpy(dead_buf->data, last_buf->data, mp->m_dirblksize); - xfs_da_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1); - dead_info = dead_buf->data; + memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize); + xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1); + dead_info = dead_buf->b_addr; /* * Get values from the moved block. */ - if (be16_to_cpu(dead_info->magic) == XFS_DIR2_LEAFN_MAGIC) { + if (dead_info->magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + dead_info->magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + dead_leaf2 = (xfs_dir2_leaf_t *)dead_info; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2); + ents = dp->d_ops->leaf_ents_p(dead_leaf2); dead_level = 0; - dead_hash = be32_to_cpu(dead_leaf2->ents[be16_to_cpu(dead_leaf2->hdr.count) - 1].hashval); + dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval); } else { - ASSERT(be16_to_cpu(dead_info->magic) == XFS_DA_NODE_MAGIC); + struct xfs_da3_icnode_hdr deadhdr; + dead_node = (xfs_da_intnode_t *)dead_info; - dead_level = be16_to_cpu(dead_node->hdr.level); - dead_hash = be32_to_cpu(dead_node->btree[be16_to_cpu(dead_node->hdr.count) - 1].hashval); + dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node); + btree = dp->d_ops->node_tree_p(dead_node); + dead_level = deadhdr.level; + dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval); } sib_buf = par_buf = NULL; /* * If the moved block has a left sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->back))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + if (error) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->forw) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1709,19 +2213,19 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->forw = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->forw, sizeof(sib_info->forw))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } /* * If the moved block has a right sibling, fix up the pointers. */ if ((sib_blkno = be32_to_cpu(dead_info->forw))) { - if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) + error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w); + if (error) goto done; - sib_info = sib_buf->data; + sib_info = sib_buf->b_addr; if (unlikely( be32_to_cpu(sib_info->back) != last_blkno || sib_info->magic != dead_info->magic)) { @@ -1731,45 +2235,45 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, goto done; } sib_info->back = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, sib_buf, + xfs_trans_log_buf(tp, sib_buf, XFS_DA_LOGRANGE(sib_info, &sib_info->back, sizeof(sib_info->back))); - xfs_da_buf_done(sib_buf); sib_buf = NULL; } - par_blkno = mp->m_dirleafblk; + par_blkno = args->geo->leafblk; level = -1; /* * Walk down the tree looking for the parent of the moved block. */ for (;;) { - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + if (error) goto done; - par_node = par_buf->data; - if (unlikely( - be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC || - (level >= 0 && level != be16_to_cpu(par_node->hdr.level) + 1))) { + par_node = par_buf->b_addr; + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); + if (level >= 0 && level != par_hdr.level + 1) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - level = be16_to_cpu(par_node->hdr.level); + level = par_hdr.level; + btree = dp->d_ops->node_tree_p(par_node); for (entno = 0; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].hashval) < dead_hash; + entno < par_hdr.count && + be32_to_cpu(btree[entno].hashval) < dead_hash; entno++) continue; - if (unlikely(entno == be16_to_cpu(par_node->hdr.count))) { + if (entno == par_hdr.count) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } - par_blkno = be32_to_cpu(par_node->btree[entno].before); + par_blkno = be32_to_cpu(btree[entno].before); if (level == dead_level + 1) break; - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); par_buf = NULL; } /* @@ -1778,14 +2282,14 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, */ for (;;) { for (; - entno < be16_to_cpu(par_node->hdr.count) && - be32_to_cpu(par_node->btree[entno].before) != last_blkno; + entno < par_hdr.count && + be32_to_cpu(btree[entno].before) != last_blkno; entno++) continue; - if (entno < be16_to_cpu(par_node->hdr.count)) + if (entno < par_hdr.count) break; - par_blkno = be32_to_cpu(par_node->hdr.info.forw); - xfs_da_brelse(tp, par_buf); + par_blkno = par_hdr.forw; + xfs_trans_brelse(tp, par_buf); par_buf = NULL; if (unlikely(par_blkno == 0)) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", @@ -1793,37 +2297,36 @@ xfs_da_swap_lastblock(xfs_da_args_t *args, xfs_dablk_t *dead_blknop, error = XFS_ERROR(EFSCORRUPTED); goto done; } - if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) + error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); + if (error) goto done; - par_node = par_buf->data; - if (unlikely( - be16_to_cpu(par_node->hdr.level) != level || - be16_to_cpu(par_node->hdr.info.magic) != XFS_DA_NODE_MAGIC)) { + par_node = par_buf->b_addr; + dp->d_ops->node_hdr_from_disk(&par_hdr, par_node); + if (par_hdr.level != level) { XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); goto done; } + btree = dp->d_ops->node_tree_p(par_node); entno = 0; } /* * Update the parent entry pointing to the moved block. */ - par_node->btree[entno].before = cpu_to_be32(dead_blkno); - xfs_da_log_buf(tp, par_buf, - XFS_DA_LOGRANGE(par_node, &par_node->btree[entno].before, - sizeof(par_node->btree[entno].before))); - xfs_da_buf_done(par_buf); - xfs_da_buf_done(dead_buf); + btree[entno].before = cpu_to_be32(dead_blkno); + xfs_trans_log_buf(tp, par_buf, + XFS_DA_LOGRANGE(par_node, &btree[entno].before, + sizeof(btree[entno].before))); *dead_blknop = last_blkno; *dead_bufp = last_buf; return 0; done: if (par_buf) - xfs_da_brelse(tp, par_buf); + xfs_trans_brelse(tp, par_buf); if (sib_buf) - xfs_da_brelse(tp, sib_buf); - xfs_da_brelse(tp, last_buf); + xfs_trans_brelse(tp, sib_buf); + xfs_trans_brelse(tp, last_buf); return error; } @@ -1831,41 +2334,43 @@ done: * Remove a btree block from a directory or attribute. */ int -xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf) +xfs_da_shrink_inode( + xfs_da_args_t *args, + xfs_dablk_t dead_blkno, + struct xfs_buf *dead_buf) { xfs_inode_t *dp; int done, error, w, count; xfs_trans_t *tp; xfs_mount_t *mp; + trace_xfs_da_shrink_inode(args); + dp = args->dp; w = args->whichfork; tp = args->trans; mp = dp->i_mount; - if (w == XFS_DATA_FORK) - count = mp->m_dirblkfsbs; - else - count = 1; + count = args->geo->fsbcount; for (;;) { /* * Remove extents. If we get ENOSPC for a dir we have to move * the last block to the place we want to kill. */ - if ((error = xfs_bunmapi(tp, dp, dead_blkno, count, - XFS_BMAPI_AFLAG(w)|XFS_BMAPI_METADATA, - 0, args->firstblock, args->flist, NULL, - &done)) == ENOSPC) { + error = xfs_bunmapi(tp, dp, dead_blkno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, + 0, args->firstblock, args->flist, &done); + if (error == ENOSPC) { if (w != XFS_DATA_FORK) break; - if ((error = xfs_da_swap_lastblock(args, &dead_blkno, - &dead_buf))) + error = xfs_da3_swap_lastblock(args, &dead_blkno, + &dead_buf); + if (error) break; } else { break; } } - xfs_da_binval(tp, dead_buf); + xfs_trans_binval(tp, dead_buf); return error; } @@ -1897,36 +2402,78 @@ xfs_da_map_covers_blocks( } /* - * Make a dabuf. - * Used for get_buf, read_buf, read_bufr, and reada_buf. + * Convert a struct xfs_bmbt_irec to a struct xfs_buf_map. + * + * For the single map case, it is assumed that the caller has provided a pointer + * to a valid xfs_buf_map. For the multiple map case, this function will + * allocate the xfs_buf_map to hold all the maps and replace the caller's single + * map pointer with the allocated map. */ -STATIC int -xfs_da_do_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t *mappedbnop, - xfs_dabuf_t **bpp, - int whichfork, - int caller, - inst_t *ra) +static int +xfs_buf_map_from_irec( + struct xfs_mount *mp, + struct xfs_buf_map **mapp, + int *nmaps, + struct xfs_bmbt_irec *irecs, + int nirecs) { - xfs_buf_t *bp = NULL; - xfs_buf_t **bplist; - int error=0; - int i; - xfs_bmbt_irec_t map; - xfs_bmbt_irec_t *mapp; - xfs_daddr_t mappedbno; - xfs_mount_t *mp; - int nbplist=0; - int nfsb; - int nmap; - xfs_dabuf_t *rbp; + struct xfs_buf_map *map; + int i; + + ASSERT(*nmaps == 1); + ASSERT(nirecs >= 1); + + if (nirecs > 1) { + map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), + KM_SLEEP | KM_NOFS); + if (!map) + return ENOMEM; + *mapp = map; + } + + *nmaps = nirecs; + map = *mapp; + for (i = 0; i < *nmaps; i++) { + ASSERT(irecs[i].br_startblock != DELAYSTARTBLOCK && + irecs[i].br_startblock != HOLESTARTBLOCK); + map[i].bm_bn = XFS_FSB_TO_DADDR(mp, irecs[i].br_startblock); + map[i].bm_len = XFS_FSB_TO_BB(mp, irecs[i].br_blockcount); + } + return 0; +} + +/* + * Map the block we are given ready for reading. There are three possible return + * values: + * -1 - will be returned if we land in a hole and mappedbno == -2 so the + * caller knows not to execute a subsequent read. + * 0 - if we mapped the block successfully + * >0 - positive error number if there was an error. + */ +static int +xfs_dabuf_map( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + struct xfs_buf_map **map, + int *nmaps) +{ + struct xfs_mount *mp = dp->i_mount; + int nfsb; + int error = 0; + struct xfs_bmbt_irec irec; + struct xfs_bmbt_irec *irecs = &irec; + int nirecs; + + ASSERT(map && *map); + ASSERT(*nmaps == 1); + + if (whichfork == XFS_DATA_FORK) + nfsb = mp->m_dir_geo->fsbcount; + else + nfsb = mp->m_attr_geo->fsbcount; - mp = dp->i_mount; - nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1; - mappedbno = *mappedbnop; /* * Caller doesn't have a mapping. -2 means don't complain * if we land in a hole. @@ -1935,179 +2482,50 @@ xfs_da_do_buf( /* * Optimize the one-block case. */ - if (nfsb == 1) { - xfs_fsblock_t fsb; + if (nfsb != 1) + irecs = kmem_zalloc(sizeof(irec) * nfsb, + KM_SLEEP | KM_NOFS); - if ((error = - xfs_bmapi_single(trans, dp, whichfork, &fsb, - (xfs_fileoff_t)bno))) { - return error; - } - mapp = ↦ - if (fsb == NULLFSBLOCK) { - nmap = 0; - } else { - map.br_startblock = fsb; - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = 1; - nmap = 1; - } - } else { - mapp = kmem_alloc(sizeof(*mapp) * nfsb, KM_SLEEP); - nmap = nfsb; - if ((error = xfs_bmapi(trans, dp, (xfs_fileoff_t)bno, - nfsb, - XFS_BMAPI_METADATA | - XFS_BMAPI_AFLAG(whichfork), - NULL, 0, mapp, &nmap, NULL, NULL))) - goto exit0; - } + nirecs = nfsb; + error = xfs_bmapi_read(dp, (xfs_fileoff_t)bno, nfsb, irecs, + &nirecs, xfs_bmapi_aflag(whichfork)); + if (error) + goto out; } else { - map.br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); - map.br_startoff = (xfs_fileoff_t)bno; - map.br_blockcount = nfsb; - mapp = ↦ - nmap = 1; + irecs->br_startblock = XFS_DADDR_TO_FSB(mp, mappedbno); + irecs->br_startoff = (xfs_fileoff_t)bno; + irecs->br_blockcount = nfsb; + irecs->br_state = 0; + nirecs = 1; } - if (!xfs_da_map_covers_blocks(nmap, mapp, bno, nfsb)) { - error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); + + if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { + error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); if (unlikely(error == EFSCORRUPTED)) { if (xfs_error_level >= XFS_ERRLEVEL_LOW) { - int i; - cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", - (long long)bno); - cmn_err(CE_ALERT, "dir: inode %lld\n", + int i; + xfs_alert(mp, "%s: bno %lld dir: inode %lld", + __func__, (long long)bno, (long long)dp->i_ino); - for (i = 0; i < nmap; i++) { - cmn_err(CE_ALERT, - "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", + for (i = 0; i < *nmaps; i++) { + xfs_alert(mp, +"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d", i, - (long long)mapp[i].br_startoff, - (long long)mapp[i].br_startblock, - (long long)mapp[i].br_blockcount, - mapp[i].br_state); + (long long)irecs[i].br_startoff, + (long long)irecs[i].br_startblock, + (long long)irecs[i].br_blockcount, + irecs[i].br_state); } } XFS_ERROR_REPORT("xfs_da_do_buf(1)", XFS_ERRLEVEL_LOW, mp); } - goto exit0; - } - if (caller != 3 && nmap > 1) { - bplist = kmem_alloc(sizeof(*bplist) * nmap, KM_SLEEP); - nbplist = 0; - } else - bplist = NULL; - /* - * Turn the mapping(s) into buffer(s). - */ - for (i = 0; i < nmap; i++) { - int nmapped; - - mappedbno = XFS_FSB_TO_DADDR(mp, mapp[i].br_startblock); - if (i == 0) - *mappedbnop = mappedbno; - nmapped = (int)XFS_FSB_TO_BB(mp, mapp[i].br_blockcount); - switch (caller) { - case 0: - bp = xfs_trans_get_buf(trans, mp->m_ddev_targp, - mappedbno, nmapped, 0); - error = bp ? XFS_BUF_GETERROR(bp) : XFS_ERROR(EIO); - break; - case 1: - case 2: - bp = NULL; - error = xfs_trans_read_buf(mp, trans, mp->m_ddev_targp, - mappedbno, nmapped, 0, &bp); - break; - case 3: - xfs_baread(mp->m_ddev_targp, mappedbno, nmapped); - error = 0; - bp = NULL; - break; - } - if (error) { - if (bp) - xfs_trans_brelse(trans, bp); - goto exit1; - } - if (!bp) - continue; - if (caller == 1) { - if (whichfork == XFS_ATTR_FORK) { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_ATTR_BTREE, - XFS_ATTR_BTREE_REF); - } else { - XFS_BUF_SET_VTYPE_REF(bp, B_FS_DIR_BTREE, - XFS_DIR_BTREE_REF); - } - } - if (bplist) { - bplist[nbplist++] = bp; - } - } - /* - * Build a dabuf structure. - */ - if (bplist) { - rbp = xfs_da_buf_make(nbplist, bplist, ra); - } else if (bp) - rbp = xfs_da_buf_make(1, &bp, ra); - else - rbp = NULL; - /* - * For read_buf, check the magic number. - */ - if (caller == 1) { - xfs_dir2_data_t *data; - xfs_dir2_free_t *free; - xfs_da_blkinfo_t *info; - uint magic, magic1; - - info = rbp->data; - data = rbp->data; - free = rbp->data; - magic = be16_to_cpu(info->magic); - magic1 = be32_to_cpu(data->hdr.magic); - if (unlikely( - XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) && - (magic != XFS_ATTR_LEAF_MAGIC) && - (magic != XFS_DIR2_LEAF1_MAGIC) && - (magic != XFS_DIR2_LEAFN_MAGIC) && - (magic1 != XFS_DIR2_BLOCK_MAGIC) && - (magic1 != XFS_DIR2_DATA_MAGIC) && - (be32_to_cpu(free->hdr.magic) != XFS_DIR2_FREE_MAGIC), - mp, XFS_ERRTAG_DA_READ_BUF, - XFS_RANDOM_DA_READ_BUF))) { - xfs_buftrace("DA READ ERROR", rbp->bps[0]); - XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)", - XFS_ERRLEVEL_LOW, mp, info); - error = XFS_ERROR(EFSCORRUPTED); - xfs_da_brelse(trans, rbp); - nbplist = 0; - goto exit1; - } - } - if (bplist) { - kmem_free(bplist, sizeof(*bplist) * nmap); - } - if (mapp != &map) { - kmem_free(mapp, sizeof(*mapp) * nfsb); + goto out; } - if (bpp) - *bpp = rbp; - return 0; -exit1: - if (bplist) { - for (i = 0; i < nbplist; i++) - xfs_trans_brelse(trans, bplist[i]); - kmem_free(bplist, sizeof(*bplist) * nmap); - } -exit0: - if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * nfsb); - if (bpp) - *bpp = NULL; + error = xfs_buf_map_from_irec(mp, map, nmaps, irecs, nirecs); +out: + if (irecs != &irec) + kmem_free(irecs); return error; } @@ -2116,346 +2534,132 @@ exit0: */ int xfs_da_get_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) -{ - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 0, - (inst_t *)__return_address); -} - -/* - * Get a buffer for the dir/attr block, fill in the contents. - */ -int -xfs_da_read_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, - int whichfork) + struct xfs_buf **bpp, + int whichfork) { - return xfs_da_do_buf(trans, dp, bno, &mappedbno, bpp, whichfork, 1, - (inst_t *)__return_address); -} - -/* - * Readahead the dir/attr block. - */ -xfs_daddr_t -xfs_da_reada_buf( - xfs_trans_t *trans, - xfs_inode_t *dp, - xfs_dablk_t bno, - int whichfork) -{ - xfs_daddr_t rval; - - rval = -1; - if (xfs_da_do_buf(trans, dp, bno, &rval, NULL, whichfork, 3, - (inst_t *)__return_address)) - return -1; - else - return rval; -} - -/* - * Calculate the number of bits needed to hold i different values. - */ -uint -xfs_da_log2_roundup(uint i) -{ - uint rval; - - for (rval = 0; rval < NBBY * sizeof(i); rval++) { - if ((1 << rval) >= i) - break; - } - return(rval); -} - -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ -kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ - -/* - * Allocate a dir-state structure. - * We don't put them on the stack since they're large. - */ -xfs_da_state_t * -xfs_da_state_alloc(void) -{ - return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); -} - -/* - * Kill the altpath contents of a da-state structure. - */ -STATIC void -xfs_da_state_kill_altpath(xfs_da_state_t *state) -{ - int i; - - for (i = 0; i < state->altpath.active; i++) { - if (state->altpath.blk[i].bp) { - if (state->altpath.blk[i].bp != state->path.blk[i].bp) - xfs_da_buf_done(state->altpath.blk[i].bp); - state->altpath.blk[i].bp = NULL; - } + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; } - state->altpath.active = 0; -} - -/* - * Free a da-state structure. - */ -void -xfs_da_state_free(xfs_da_state_t *state) -{ - int i; - xfs_da_state_kill_altpath(state); - for (i = 0; i < state->path.active; i++) { - if (state->path.blk[i].bp) - xfs_da_buf_done(state->path.blk[i].bp); + bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, + mapp, nmap, 0); + error = bp ? bp->b_error : XFS_ERROR(EIO); + if (error) { + xfs_trans_brelse(trans, bp); + goto out_free; } - if (state->extravalid && state->extrablk.bp) - xfs_da_buf_done(state->extrablk.bp); -#ifdef DEBUG - memset((char *)state, 0, sizeof(*state)); -#endif /* DEBUG */ - kmem_zone_free(xfs_da_state_zone, state); -} -#ifdef XFS_DABUF_DEBUG -xfs_dabuf_t *xfs_dabuf_global_list; -lock_t xfs_dabuf_global_lock; -#endif + *bpp = bp; -/* - * Create a dabuf. - */ -/* ARGSUSED */ -STATIC xfs_dabuf_t * -xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) -{ - xfs_buf_t *bp; - xfs_dabuf_t *dabuf; - int i; - int off; +out_free: + if (mapp != &map) + kmem_free(mapp); - if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); - else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); - dabuf->dirty = 0; -#ifdef XFS_DABUF_DEBUG - dabuf->ra = ra; - dabuf->target = XFS_BUF_TARGET(bps[0]); - dabuf->blkno = XFS_BUF_ADDR(bps[0]); -#endif - if (nbuf == 1) { - dabuf->nbuf = 1; - bp = bps[0]; - dabuf->bbcount = (short)BTOBB(XFS_BUF_COUNT(bp)); - dabuf->data = XFS_BUF_PTR(bp); - dabuf->bps[0] = bp; - } else { - dabuf->nbuf = nbuf; - for (i = 0, dabuf->bbcount = 0; i < nbuf; i++) { - dabuf->bps[i] = bp = bps[i]; - dabuf->bbcount += BTOBB(XFS_BUF_COUNT(bp)); - } - dabuf->data = kmem_alloc(BBTOB(dabuf->bbcount), KM_SLEEP); - for (i = off = 0; i < nbuf; i++, off += XFS_BUF_COUNT(bp)) { - bp = bps[i]; - memcpy((char *)dabuf->data + off, XFS_BUF_PTR(bp), - XFS_BUF_COUNT(bp)); - } - } -#ifdef XFS_DABUF_DEBUG - { - SPLDECL(s); - xfs_dabuf_t *p; - - s = mutex_spinlock(&xfs_dabuf_global_lock); - for (p = xfs_dabuf_global_list; p; p = p->next) { - ASSERT(p->blkno != dabuf->blkno || - p->target != dabuf->target); - } - dabuf->prev = NULL; - if (xfs_dabuf_global_list) - xfs_dabuf_global_list->prev = dabuf; - dabuf->next = xfs_dabuf_global_list; - xfs_dabuf_global_list = dabuf; - mutex_spinunlock(&xfs_dabuf_global_lock, s); - } -#endif - return dabuf; + return error; } /* - * Un-dirty a dabuf. + * Get a buffer for the dir/attr block, fill in the contents. */ -STATIC void -xfs_da_buf_clean(xfs_dabuf_t *dabuf) +int +xfs_da_read_buf( + struct xfs_trans *trans, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp, + int whichfork, + const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; - int i; - int off; - - if (dabuf->dirty) { - ASSERT(dabuf->nbuf > 1); - dabuf->dirty = 0; - for (i = off = 0; i < dabuf->nbuf; - i++, off += XFS_BUF_COUNT(bp)) { - bp = dabuf->bps[i]; - memcpy(XFS_BUF_PTR(bp), (char *)dabuf->data + off, - XFS_BUF_COUNT(bp)); - } + struct xfs_buf *bp; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; + + *bpp = NULL; + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; } -} -/* - * Release a dabuf. - */ -void -xfs_da_buf_done(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf); - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->dirty) - xfs_da_buf_clean(dabuf); - if (dabuf->nbuf > 1) - kmem_free(dabuf->data, BBTOB(dabuf->bbcount)); -#ifdef XFS_DABUF_DEBUG - { - SPLDECL(s); - - s = mutex_spinlock(&xfs_dabuf_global_lock); - if (dabuf->prev) - dabuf->prev->next = dabuf->next; - else - xfs_dabuf_global_list = dabuf->next; - if (dabuf->next) - dabuf->next->prev = dabuf->prev; - mutex_spinunlock(&xfs_dabuf_global_lock, s); - } - memset(dabuf, 0, XFS_DA_BUF_SIZE(dabuf->nbuf)); -#endif - if (dabuf->nbuf == 1) - kmem_zone_free(xfs_dabuf_zone, dabuf); - else - kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf)); -} + error = xfs_trans_read_buf_map(dp->i_mount, trans, + dp->i_mount->m_ddev_targp, + mapp, nmap, 0, &bp, ops); + if (error) + goto out_free; -/* - * Log transaction from a dabuf. - */ -void -xfs_da_log_buf(xfs_trans_t *tp, xfs_dabuf_t *dabuf, uint first, uint last) -{ - xfs_buf_t *bp; - uint f; - int i; - uint l; - int off; + if (whichfork == XFS_ATTR_FORK) + xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF); + else + xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF); + *bpp = bp; +out_free: + if (mapp != &map) + kmem_free(mapp); - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if (dabuf->nbuf == 1) { - ASSERT(dabuf->data == (void *)XFS_BUF_PTR(dabuf->bps[0])); - xfs_trans_log_buf(tp, dabuf->bps[0], first, last); - return; - } - dabuf->dirty = 1; - ASSERT(first <= last); - for (i = off = 0; i < dabuf->nbuf; i++, off += XFS_BUF_COUNT(bp)) { - bp = dabuf->bps[i]; - f = off; - l = f + XFS_BUF_COUNT(bp) - 1; - if (f < first) - f = first; - if (l > last) - l = last; - if (f <= l) - xfs_trans_log_buf(tp, bp, f - off, l - off); - /* - * B_DONE is set by xfs_trans_log buf. - * If we don't set it on a new buffer (get not read) - * then if we don't put anything in the buffer it won't - * be set, and at commit it it released into the cache, - * and then a read will fail. - */ - else if (!(XFS_BUF_ISDONE(bp))) - XFS_BUF_DONE(bp); - } - ASSERT(last < off); + return error; } /* - * Release dabuf from a transaction. - * Have to free up the dabuf before the buffers are released, - * since the synchronization on the dabuf is really the lock on the buffer. + * Readahead the dir/attr block. */ -void -xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf) +xfs_daddr_t +xfs_da_reada_buf( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mappedbno, + int whichfork, + const struct xfs_buf_ops *ops) { - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; + struct xfs_buf_map map; + struct xfs_buf_map *mapp; + int nmap; + int error; - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); + mapp = ↦ + nmap = 1; + error = xfs_dabuf_map(dp, bno, mappedbno, whichfork, + &mapp, &nmap); + if (error) { + /* mapping a hole is not an error, but we don't continue */ + if (error == -1) + error = 0; + goto out_free; } - xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_brelse(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist, nbuf * sizeof(*bplist)); -} -/* - * Invalidate dabuf from a transaction. - */ -void -xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf) -{ - xfs_buf_t *bp; - xfs_buf_t **bplist; - int i; - int nbuf; + mappedbno = mapp[0].bm_bn; + xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); - ASSERT(dabuf->nbuf && dabuf->data && dabuf->bbcount && dabuf->bps[0]); - if ((nbuf = dabuf->nbuf) == 1) { - bplist = &bp; - bp = dabuf->bps[0]; - } else { - bplist = kmem_alloc(nbuf * sizeof(*bplist), KM_SLEEP); - memcpy(bplist, dabuf->bps, nbuf * sizeof(*bplist)); - } - xfs_da_buf_done(dabuf); - for (i = 0; i < nbuf; i++) - xfs_trans_binval(tp, bplist[i]); - if (bplist != &bp) - kmem_free(bplist, nbuf * sizeof(*bplist)); -} +out_free: + if (mapp != &map) + kmem_free(mapp); -/* - * Get the first daddr from a dabuf. - */ -xfs_daddr_t -xfs_da_blkno(xfs_dabuf_t *dabuf) -{ - ASSERT(dabuf->nbuf); - ASSERT(dabuf->data); - return XFS_BUF_ADDR(dabuf->bps[0]); + if (error) + return -1; + return mappedbno; } diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 4ab865ec8b8..6e153e399a7 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -18,93 +19,51 @@ #ifndef __XFS_DA_BTREE_H__ #define __XFS_DA_BTREE_H__ -struct xfs_buf; struct xfs_bmap_free; struct xfs_inode; -struct xfs_mount; struct xfs_trans; struct zone; - -/*======================================================================== - * Directory Structure when greater than XFS_LBSIZE(mp) bytes. - *========================================================================*/ +struct xfs_dir_ops; /* - * This structure is common to both leaf nodes and non-leaf nodes in the Btree. - * - * Is is used to manage a doubly linked list of all blocks at the same - * level in the Btree, and to identify which type of block this is. + * Directory/attribute geometry information. There will be one of these for each + * data fork type, and it will be passed around via the xfs_da_args. Global + * structures will be attached to the xfs_mount. */ -#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ -#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ - -typedef struct xfs_da_blkinfo { - __be32 forw; /* previous block in list */ - __be32 back; /* following block in list */ - __be16 magic; /* validity check on block */ - __be16 pad; /* unused */ -} xfs_da_blkinfo_t; - -/* - * This is the structure of the root and intermediate nodes in the Btree. - * The leaf nodes are defined above. - * - * Entries are not packed. - * - * Since we have duplicate keys, use a binary search but always follow - * all match in the block, not just the first match found. - */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ - -typedef struct xfs_da_intnode { - struct xfs_da_node_hdr { /* constant-structure header block */ - xfs_da_blkinfo_t info; /* block type, links, etc. */ - __be16 count; /* count of active entries */ - __be16 level; /* level above leaves (leaf == 0) */ - } hdr; - struct xfs_da_node_entry { - __be32 hashval; /* hash value for this descendant */ - __be32 before; /* Btree block before this key */ - } btree[1]; /* variable sized array of keys */ -} xfs_da_intnode_t; -typedef struct xfs_da_node_hdr xfs_da_node_hdr_t; -typedef struct xfs_da_node_entry xfs_da_node_entry_t; - -#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */ - -#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize -#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog - -#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \ - (((bno) << (mp)->m_dircook_elog) | (entry)) -#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \ - (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash)) -#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie) -#define XFS_DA_COOKIE_BNO(mp,cookie) \ - ((((xfs_off_t)(cookie) >> 31) == -1LL ? \ - (xfs_dablk_t)0 : \ - (xfs_dablk_t)((xfs_off_t)(cookie) >> \ - ((mp)->m_dircook_elog + 32)))) -#define XFS_DA_COOKIE_ENTRY(mp,cookie) \ - ((((xfs_off_t)(cookie) >> 31) == -1LL ? \ - (xfs_dablk_t)0 : \ - (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \ - ((1 << (mp)->m_dircook_elog) - 1)))) - +struct xfs_da_geometry { + int blksize; /* da block size in bytes */ + int fsbcount; /* da block size in filesystem blocks */ + uint8_t fsblog; /* log2 of _filesystem_ block size */ + uint8_t blklog; /* log2 of da block size */ + uint node_ents; /* # of entries in a danode */ + int magicpct; /* 37% of block size in bytes */ + xfs_dablk_t datablk; /* blockno of dir data v2 */ + xfs_dablk_t leafblk; /* blockno of leaf data v2 */ + xfs_dablk_t freeblk; /* blockno of free data v2 */ +}; /*======================================================================== * Btree searching and modification structure definitions. *========================================================================*/ /* + * Search comparison results + */ +enum xfs_dacmp { + XFS_CMP_DIFFERENT, /* names are completely different */ + XFS_CMP_EXACT, /* names are exactly the same */ + XFS_CMP_CASE /* names are same but differ in case */ +}; + +/* * Structure to ease passing around component names. */ typedef struct xfs_da_args { - const uchar_t *name; /* string (maybe not NULL terminated) */ + struct xfs_da_geometry *geo; /* da block geometry */ + const __uint8_t *name; /* string (maybe not NULL terminated) */ int namelen; /* length of string (maybe no NULL) */ - uchar_t *value; /* set of bytes (maybe contain NULLs) */ + __uint8_t filetype; /* filetype of inode for directories */ + __uint8_t *value; /* set of bytes (maybe contain NULLs) */ int valuelen; /* length of value */ int flags; /* argument flags (eg: ATTR_NOCREATE) */ xfs_dahash_t hashval; /* hash value of name */ @@ -119,44 +78,31 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ - unsigned char justcheck; /* T/F: check for ok with no space */ - unsigned char rename; /* T/F: this is an atomic rename op */ - unsigned char addname; /* T/F: this is an add operation */ - unsigned char oknoent; /* T/F: ok to return ENOENT, else die */ + int rmtvaluelen2; /* remote attr value length in bytes */ + int op_flags; /* operation flags */ + enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; /* - * Structure to describe buffer(s) for a block. - * This is needed in the directory version 2 format case, when - * multiple non-contiguous fsblocks might be needed to cover one - * logical directory block. - * If the buffer count is 1 then the data pointer points to the - * same place as the b_addr field for the buffer, else to kmem_alloced memory. + * Operation flags: */ -typedef struct xfs_dabuf { - int nbuf; /* number of buffer pointers present */ - short dirty; /* data needs to be copied back */ - short bbcount; /* how large is data in bbs */ - void *data; /* pointer for buffers' data */ -#ifdef XFS_DABUF_DEBUG - inst_t *ra; /* return address of caller to make */ - struct xfs_dabuf *next; /* next in global chain */ - struct xfs_dabuf *prev; /* previous in global chain */ - struct xfs_buftarg *target; /* device for buffer */ - xfs_daddr_t blkno; /* daddr first in bps[0] */ -#endif - struct xfs_buf *bps[1]; /* actually nbuf of these */ -} xfs_dabuf_t; -#define XFS_DA_BUF_SIZE(n) \ - (sizeof(xfs_dabuf_t) + sizeof(struct xfs_buf *) * ((n) - 1)) - -#ifdef XFS_DABUF_DEBUG -extern xfs_dabuf_t *xfs_dabuf_global_list; -#endif +#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */ +#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */ +#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */ +#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */ +#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */ + +#define XFS_DA_OP_FLAGS \ + { XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \ + { XFS_DA_OP_RENAME, "RENAME" }, \ + { XFS_DA_OP_ADDNAME, "ADDNAME" }, \ + { XFS_DA_OP_OKNOENT, "OKNOENT" }, \ + { XFS_DA_OP_CILOOKUP, "CILOOKUP" } /* * Storage for holding state during Btree searches and split/join ops. @@ -166,7 +112,7 @@ extern xfs_dabuf_t *xfs_dabuf_global_list; * which is slightly more than enough. */ typedef struct xfs_da_state_blk { - xfs_dabuf_t *bp; /* buffer containing block */ + struct xfs_buf *bp; /* buffer containing block */ xfs_dablk_t blkno; /* filesystem blkno of buffer */ xfs_daddr_t disk_blkno; /* on-disk blkno (in BBs) of buffer */ int index; /* relevant index into block */ @@ -182,14 +128,12 @@ typedef struct xfs_da_state_path { typedef struct xfs_da_state { xfs_da_args_t *args; /* filename arguments */ struct xfs_mount *mp; /* filesystem mount point */ - unsigned int blocksize; /* logical block size */ - unsigned int node_ents; /* how many entries in danode */ xfs_da_state_path_t path; /* search/split paths */ xfs_da_state_path_t altpath; /* alternate path for join */ unsigned char inleaf; /* insert into 1->lf, 0->splf */ unsigned char extravalid; /* T/F: extrablk is in use */ unsigned char extraafter; /* T/F: extrablk is after new */ - xfs_da_state_blk_t extrablk; /* for double-splits on leafs */ + xfs_da_state_blk_t extrablk; /* for double-splits on leaves */ /* for dirv2 extrablk is data */ } xfs_da_state_t; @@ -201,66 +145,77 @@ typedef struct xfs_da_state { (uint)(XFS_DA_LOGOFF(BASE, ADDR)), \ (uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1) +/* + * Name ops for directory and/or attr name operations + */ +struct xfs_nameops { + xfs_dahash_t (*hashname)(struct xfs_name *); + enum xfs_dacmp (*compname)(struct xfs_da_args *, + const unsigned char *, int); +}; + -#ifdef __KERNEL__ /*======================================================================== - * Function prototypes for the kernel. + * Function prototypes. *========================================================================*/ /* * Routines used for growing the Btree. */ -int xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, - xfs_dabuf_t **bpp, int whichfork); -int xfs_da_split(xfs_da_state_t *state); +int xfs_da3_node_create(struct xfs_da_args *args, xfs_dablk_t blkno, + int level, struct xfs_buf **bpp, int whichfork); +int xfs_da3_split(xfs_da_state_t *state); /* * Routines used for shrinking the Btree. */ -int xfs_da_join(xfs_da_state_t *state); -void xfs_da_fixhashpath(xfs_da_state_t *state, - xfs_da_state_path_t *path_to_to_fix); +int xfs_da3_join(xfs_da_state_t *state); +void xfs_da3_fixhashpath(struct xfs_da_state *state, + struct xfs_da_state_path *path_to_to_fix); /* * Routines used for finding things in the Btree. */ -int xfs_da_node_lookup_int(xfs_da_state_t *state, int *result); -int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, +int xfs_da3_node_lookup_int(xfs_da_state_t *state, int *result); +int xfs_da3_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, int forward, int release, int *result); /* * Utility routines. */ -int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, +int xfs_da3_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, xfs_da_state_blk_t *new_blk); +int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mappedbno, + struct xfs_buf **bpp, int which_fork); /* * Utility routines. */ int xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno); +int xfs_da_grow_inode_int(struct xfs_da_args *args, xfs_fileoff_t *bno, + int count); int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bp, int whichfork); + struct xfs_buf **bp, int whichfork); int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, - xfs_dabuf_t **bpp, int whichfork); -xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, - xfs_dablk_t bno, int whichfork); + struct xfs_buf **bpp, int whichfork, + const struct xfs_buf_ops *ops); +xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno, int whichfork, + const struct xfs_buf_ops *ops); int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, - xfs_dabuf_t *dead_buf); + struct xfs_buf *dead_buf); + +uint xfs_da_hashname(const __uint8_t *name_string, int name_length); +enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args, + const unsigned char *name, int len); + -uint xfs_da_hashname(const uchar_t *name_string, int name_length); -uint xfs_da_log2_roundup(uint i); xfs_da_state_t *xfs_da_state_alloc(void); void xfs_da_state_free(xfs_da_state_t *state); -void xfs_da_buf_done(xfs_dabuf_t *dabuf); -void xfs_da_log_buf(struct xfs_trans *tp, xfs_dabuf_t *dabuf, uint first, - uint last); -void xfs_da_brelse(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -void xfs_da_binval(struct xfs_trans *tp, xfs_dabuf_t *dabuf); -xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf); - extern struct kmem_zone *xfs_da_state_zone; -#endif /* __KERNEL__ */ +extern const struct xfs_nameops xfs_default_nameops; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c new file mode 100644 index 00000000000..c9aee52a37e --- /dev/null +++ b/fs/xfs/xfs_da_format.c @@ -0,0 +1,911 @@ +/* + * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" + +/* + * Shortform directory ops + */ +static int +xfs_dir2_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ + + count += len; /* name */ + count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : + sizeof(xfs_dir2_ino4_t); /* ino # */ + return count; +} + +static int +xfs_dir3_sf_entsize( + struct xfs_dir2_sf_hdr *hdr, + int len) +{ + return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t); +} + +static struct xfs_dir2_sf_entry * +xfs_dir2_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen)); +} + +static struct xfs_dir2_sf_entry * +xfs_dir3_sf_nextentry( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen)); +} + + +/* + * For filetype enabled shortform directories, the file type field is stored at + * the end of the name. Because it's only a single byte, endian conversion is + * not necessary. For non-filetype enable directories, the type is always + * unknown and we never store the value. + */ +static __uint8_t +xfs_dir2_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_sfe_get_ftype( + struct xfs_dir2_sf_entry *sfep) +{ + __uint8_t ftype; + + ftype = sfep->name[sfep->namelen]; + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_sfe_put_ftype( + struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); + + sfep->name[sfep->namelen] = ftype; +} + +/* + * Inode numbers in short-form directories can come in two versions, + * either 4 bytes or 8 bytes wide. These helpers deal with the + * two forms transparently by looking at the headers i8count field. + * + * For 64-bit inode number the most significant byte must be zero. + */ +static xfs_ino_t +xfs_dir2_sf_get_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *from) +{ + if (hdr->i8count) + return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; + else + return get_unaligned_be32(&from->i4.i); +} + +static void +xfs_dir2_sf_put_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_dir2_inou_t *to, + xfs_ino_t ino) +{ + ASSERT((ino & 0xff00000000000000ULL) == 0); + + if (hdr->i8count) + put_unaligned_be64(ino, &to->i8.i); + else + put_unaligned_be32(ino, &to->i4.i); +} + +static xfs_ino_t +xfs_dir2_sf_get_parent_ino( + struct xfs_dir2_sf_hdr *hdr) +{ + return xfs_dir2_sf_get_ino(hdr, &hdr->parent); +} + +static void +xfs_dir2_sf_put_parent_ino( + struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); +} + +/* + * In short-form directory entries the inode numbers are stored at variable + * offset behind the entry name. If the entry stores a filetype value, then it + * sits between the name and the inode number. Hence the inode numbers may only + * be accessed through the helpers below. + */ +static xfs_ino_t +xfs_dir2_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]); +} + +static void +xfs_dir2_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino); +} + +static xfs_ino_t +xfs_dir3_sfe_get_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep) +{ + return xfs_dir2_sf_get_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]); +} + +static void +xfs_dir3_sfe_put_ino( + struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino) +{ + xfs_dir2_sf_put_ino(hdr, + (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino); +} + + +/* + * Directory data block operations + */ + +/* + * For special situations, the dirent size ends up fixed because we always know + * what the size of the entry is. That's true for the "." and "..", and + * therefore we know that they are a fixed size and hence their offsets are + * constant, as is the first entry. + * + * Hence, this calculation is written as a macro to be able to be calculated at + * compile time and so certain offsets can be calculated directly in the + * structure initaliser via the macro. There are two macros - one for dirents + * with ftype and without so there are no unresolvable conditionals in the + * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power + * of 2 and the compiler doesn't reject it (unlike roundup()). + */ +#define XFS_DIR2_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN) + +#define XFS_DIR3_DATA_ENTSIZE(n) \ + round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \ + sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \ + XFS_DIR2_DATA_ALIGN) + +static int +xfs_dir2_data_entsize( + int n) +{ + return XFS_DIR2_DATA_ENTSIZE(n); +} + +static int +xfs_dir3_data_entsize( + int n) +{ + return XFS_DIR3_DATA_ENTSIZE(n); +} + +static __uint8_t +xfs_dir2_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + return XFS_DIR3_FT_UNKNOWN; +} + +static void +xfs_dir2_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t ftype) +{ + ASSERT(ftype < XFS_DIR3_FT_MAX); +} + +static __uint8_t +xfs_dir3_data_get_ftype( + struct xfs_dir2_data_entry *dep) +{ + __uint8_t ftype = dep->name[dep->namelen]; + + ASSERT(ftype < XFS_DIR3_FT_MAX); + if (ftype >= XFS_DIR3_FT_MAX) + return XFS_DIR3_FT_UNKNOWN; + return ftype; +} + +static void +xfs_dir3_data_put_ftype( + struct xfs_dir2_data_entry *dep, + __uint8_t type) +{ + ASSERT(type < XFS_DIR3_FT_MAX); + ASSERT(dep->namelen != 0); + + dep->name[dep->namelen] = type; +} + +/* + * Pointer to an entry's tag word. + */ +static __be16 * +xfs_dir2_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16)); +} + +static __be16 * +xfs_dir3_data_entry_tag_p( + struct xfs_dir2_data_entry *dep) +{ + return (__be16 *)((char *)dep + + xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16)); +} + +/* + * location of . and .. in data space (always block 0) + */ +static struct xfs_dir2_data_entry * +xfs_dir2_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir2_ftype_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_dotdot_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_first_entry_p( + struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2)); +} + +static struct xfs_dir2_data_free * +xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return hdr->bestfree; +} + +static struct xfs_dir2_data_free * +xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir3_data_hdr *)hdr)->best_free; +} + +static struct xfs_dir2_data_entry * +xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir2_data_hdr)); +} + +static struct xfs_dir2_data_entry * +xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_entry *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + +static struct xfs_dir2_data_unused * +xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr) +{ + return (struct xfs_dir2_data_unused *) + ((char *)hdr + sizeof(struct xfs_dir3_data_hdr)); +} + + +/* + * Directory Leaf block operations + */ +static int +xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return lp->__ents; +} + +static int +xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) / + (uint)sizeof(struct xfs_dir2_leaf_entry); +} + +static struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp) +{ + return ((struct xfs_dir3_leaf *)lp)->__ents; +} + +static void +xfs_dir2_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.count); + to->stale = be16_to_cpu(from->hdr.stale); + + ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC || + to->magic == XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir2_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC || + from->magic == XFS_DIR2_LEAFN_MAGIC); + + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.count = cpu_to_be16(from->count); + to->hdr.stale = cpu_to_be16(from->stale); +} + +static void +xfs_dir3_leaf_hdr_from_disk( + struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from; + + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->count); + to->stale = be16_to_cpu(hdr3->stale); + + ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC || + to->magic == XFS_DIR3_LEAFN_MAGIC); +} + +static void +xfs_dir3_leaf_hdr_to_disk( + struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from) +{ + struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || + from->magic == XFS_DIR3_LEAFN_MAGIC); + + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->count = cpu_to_be16(from->count); + hdr3->stale = cpu_to_be16(from->stale); +} + + +/* + * Directory/Attribute Node block operations + */ +static struct xfs_da_node_entry * +xfs_da2_node_tree_p(struct xfs_da_intnode *dap) +{ + return dap->__btree; +} + +static struct xfs_da_node_entry * +xfs_da3_node_tree_p(struct xfs_da_intnode *dap) +{ + return ((struct xfs_da3_intnode *)dap)->__btree; +} + +static void +xfs_da2_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC)); + to->forw = be32_to_cpu(from->hdr.info.forw); + to->back = be32_to_cpu(from->hdr.info.back); + to->magic = be16_to_cpu(from->hdr.info.magic); + to->count = be16_to_cpu(from->hdr.__count); + to->level = be16_to_cpu(from->hdr.__level); +} + +static void +xfs_da2_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + ASSERT(from->magic == XFS_DA_NODE_MAGIC); + to->hdr.info.forw = cpu_to_be32(from->forw); + to->hdr.info.back = cpu_to_be32(from->back); + to->hdr.info.magic = cpu_to_be16(from->magic); + to->hdr.__count = cpu_to_be16(from->count); + to->hdr.__level = cpu_to_be16(from->level); +} + +static void +xfs_da3_node_hdr_from_disk( + struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from; + + ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)); + to->forw = be32_to_cpu(hdr3->info.hdr.forw); + to->back = be32_to_cpu(hdr3->info.hdr.back); + to->magic = be16_to_cpu(hdr3->info.hdr.magic); + to->count = be16_to_cpu(hdr3->__count); + to->level = be16_to_cpu(hdr3->__level); +} + +static void +xfs_da3_node_hdr_to_disk( + struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from) +{ + struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to; + + ASSERT(from->magic == XFS_DA3_NODE_MAGIC); + hdr3->info.hdr.forw = cpu_to_be32(from->forw); + hdr3->info.hdr.back = cpu_to_be32(from->back); + hdr3->info.hdr.magic = cpu_to_be16(from->magic); + hdr3->__count = cpu_to_be16(from->count); + hdr3->__level = cpu_to_be16(from->level); +} + + +/* + * Directory free space block operations + */ +static int +xfs_dir2_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir2_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir2_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir2_free_max_bests(geo); +} + +static int +xfs_dir3_free_max_bests(struct xfs_da_geometry *geo) +{ + return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) / + sizeof(xfs_dir2_data_off_t); +} + +static __be16 * +xfs_dir3_free_bests_p(struct xfs_dir2_free *free) +{ + return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr)); +} + +/* + * Convert data space db to the corresponding free db. + */ +static xfs_dir2_db_t +xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) + + (db / xfs_dir3_free_max_bests(geo)); +} + +/* + * Convert data space db to the corresponding index in a free db. + */ +static int +xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return db % xfs_dir3_free_max_bests(geo); +} + +static void +xfs_dir2_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + to->magic = be32_to_cpu(from->hdr.magic); + to->firstdb = be32_to_cpu(from->hdr.firstdb); + to->nvalid = be32_to_cpu(from->hdr.nvalid); + to->nused = be32_to_cpu(from->hdr.nused); + ASSERT(to->magic == XFS_DIR2_FREE_MAGIC); +} + +static void +xfs_dir2_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + ASSERT(from->magic == XFS_DIR2_FREE_MAGIC); + + to->hdr.magic = cpu_to_be32(from->magic); + to->hdr.firstdb = cpu_to_be32(from->firstdb); + to->hdr.nvalid = cpu_to_be32(from->nvalid); + to->hdr.nused = cpu_to_be32(from->nused); +} + +static void +xfs_dir3_free_hdr_from_disk( + struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from; + + to->magic = be32_to_cpu(hdr3->hdr.magic); + to->firstdb = be32_to_cpu(hdr3->firstdb); + to->nvalid = be32_to_cpu(hdr3->nvalid); + to->nused = be32_to_cpu(hdr3->nused); + + ASSERT(to->magic == XFS_DIR3_FREE_MAGIC); +} + +static void +xfs_dir3_free_hdr_to_disk( + struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from) +{ + struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to; + + ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); + + hdr3->hdr.magic = cpu_to_be32(from->magic); + hdr3->firstdb = cpu_to_be32(from->firstdb); + hdr3->nvalid = cpu_to_be32(from->nvalid); + hdr3->nused = cpu_to_be32(from->nused); +} + +static const struct xfs_dir_ops xfs_dir2_ops = { + .sf_entsize = xfs_dir2_sf_entsize, + .sf_nextentry = xfs_dir2_sf_nextentry, + .sf_get_ftype = xfs_dir2_sfe_get_ftype, + .sf_put_ftype = xfs_dir2_sfe_put_ftype, + .sf_get_ino = xfs_dir2_sfe_get_ino, + .sf_put_ino = xfs_dir2_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir2_data_entsize, + .data_get_ftype = xfs_dir2_data_get_ftype, + .data_put_ftype = xfs_dir2_data_put_ftype, + .data_entry_tag_p = xfs_dir2_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR2_DATA_ENTSIZE(1) + + XFS_DIR2_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_ftype_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir2_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir2_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir2_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir2_data_hdr), + + .data_dot_entry_p = xfs_dir2_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p, + .data_entry_p = xfs_dir2_data_entry_p, + .data_unused_p = xfs_dir2_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir2_max_leaf_ents, + .leaf_ents_p = xfs_dir2_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir2_free_hdr), + .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk, + .free_max_bests = xfs_dir2_free_max_bests, + .free_bests_p = xfs_dir2_free_bests_p, + .db_to_fdb = xfs_dir2_db_to_fdb, + .db_to_fdindex = xfs_dir2_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir3_ops = { + .sf_entsize = xfs_dir3_sf_entsize, + .sf_nextentry = xfs_dir3_sf_nextentry, + .sf_get_ftype = xfs_dir3_sfe_get_ftype, + .sf_put_ftype = xfs_dir3_sfe_put_ftype, + .sf_get_ino = xfs_dir3_sfe_get_ino, + .sf_put_ino = xfs_dir3_sfe_put_ino, + .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino, + .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino, + + .data_entsize = xfs_dir3_data_entsize, + .data_get_ftype = xfs_dir3_data_get_ftype, + .data_put_ftype = xfs_dir3_data_put_ftype, + .data_entry_tag_p = xfs_dir3_data_entry_tag_p, + .data_bestfree_p = xfs_dir3_data_bestfree_p, + + .data_dot_offset = sizeof(struct xfs_dir3_data_hdr), + .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1), + .data_first_offset = sizeof(struct xfs_dir3_data_hdr) + + XFS_DIR3_DATA_ENTSIZE(1) + + XFS_DIR3_DATA_ENTSIZE(2), + .data_entry_offset = sizeof(struct xfs_dir3_data_hdr), + + .data_dot_entry_p = xfs_dir3_data_dot_entry_p, + .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p, + .data_first_entry_p = xfs_dir3_data_first_entry_p, + .data_entry_p = xfs_dir3_data_entry_p, + .data_unused_p = xfs_dir3_data_unused_p, + + .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr), + .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk, + .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk, + .leaf_max_ents = xfs_dir3_max_leaf_ents, + .leaf_ents_p = xfs_dir3_leaf_ents_p, + + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, + + .free_hdr_size = sizeof(struct xfs_dir3_free_hdr), + .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk, + .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk, + .free_max_bests = xfs_dir3_free_max_bests, + .free_bests_p = xfs_dir3_free_bests_p, + .db_to_fdb = xfs_dir3_db_to_fdb, + .db_to_fdindex = xfs_dir3_db_to_fdindex, +}; + +static const struct xfs_dir_ops xfs_dir2_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da_node_hdr), + .node_hdr_to_disk = xfs_da2_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da2_node_hdr_from_disk, + .node_tree_p = xfs_da2_node_tree_p, +}; + +static const struct xfs_dir_ops xfs_dir3_nondir_ops = { + .node_hdr_size = sizeof(struct xfs_da3_node_hdr), + .node_hdr_to_disk = xfs_da3_node_hdr_to_disk, + .node_hdr_from_disk = xfs_da3_node_hdr_from_disk, + .node_tree_p = xfs_da3_node_tree_p, +}; + +/* + * Return the ops structure according to the current config. If we are passed + * an inode, then that overrides the default config we use which is based on + * feature bits. + */ +const struct xfs_dir_ops * +xfs_dir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_dir_inode_ops) + return mp->m_dir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_ops; + if (xfs_sb_version_hasftype(&mp->m_sb)) + return &xfs_dir2_ftype_ops; + return &xfs_dir2_ops; +} + +const struct xfs_dir_ops * +xfs_nondir_get_ops( + struct xfs_mount *mp, + struct xfs_inode *dp) +{ + if (dp) + return dp->d_ops; + if (mp->m_nondir_inode_ops) + return mp->m_nondir_inode_ops; + if (xfs_sb_version_hascrc(&mp->m_sb)) + return &xfs_dir3_nondir_ops; + return &xfs_dir2_nondir_ops; +} diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/xfs_da_format.h new file mode 100644 index 00000000000..0a49b028637 --- /dev/null +++ b/fs/xfs/xfs_da_format.h @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DA_FORMAT_H__ +#define __XFS_DA_FORMAT_H__ + +/* + * This structure is common to both leaf nodes and non-leaf nodes in the Btree. + * + * It is used to manage a doubly linked list of all blocks at the same + * level in the Btree, and to identify which type of block this is. + */ +#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ +#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ +#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ + +typedef struct xfs_da_blkinfo { + __be32 forw; /* previous block in list */ + __be32 back; /* following block in list */ + __be16 magic; /* validity check on block */ + __be16 pad; /* unused */ +} xfs_da_blkinfo_t; + +/* + * CRC enabled directory structure types + * + * The headers change size for the additional verification information, but + * otherwise the tree layouts and contents are unchanged. Hence the da btree + * code can use the struct xfs_da_blkinfo for manipulating the tree links and + * magic numbers without modification for both v2 and v3 nodes. + */ +#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ +#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ + +struct xfs_da3_blkinfo { + /* + * the node link manipulation code relies on the fact that the first + * element of this structure is the struct xfs_da_blkinfo so it can + * ignore the differences in the rest of the structures. + */ + struct xfs_da_blkinfo hdr; + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +/* + * This is the structure of the root and intermediate nodes in the Btree. + * The leaf nodes are defined above. + * + * Entries are not packed. + * + * Since we have duplicate keys, use a binary search but always follow + * all match in the block, not just the first match found. + */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ + +typedef struct xfs_da_node_hdr { + struct xfs_da_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ +} xfs_da_node_hdr_t; + +struct xfs_da3_node_hdr { + struct xfs_da3_blkinfo info; /* block type, links, etc. */ + __be16 __count; /* count of active entries */ + __be16 __level; /* level above leaves (leaf == 0) */ + __be32 __pad32; +}; + +#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc)) + +typedef struct xfs_da_node_entry { + __be32 hashval; /* hash value for this descendant */ + __be32 before; /* Btree block before this key */ +} xfs_da_node_entry_t; + +typedef struct xfs_da_intnode { + struct xfs_da_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +} xfs_da_intnode_t; + +struct xfs_da3_intnode { + struct xfs_da3_node_hdr hdr; + struct xfs_da_node_entry __btree[]; +}; + +/* + * In-core version of the node header to abstract the differences in the v2 and + * v3 disk format of the headers. Callers need to convert to/from disk format as + * appropriate. + */ +struct xfs_da3_icnode_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t level; +}; + +/* + * Directory version 2. + * + * There are 4 possible formats: + * - shortform - embedded into the inode + * - single block - data with embedded leaf at the end + * - multiple data blocks, single leaf+freeindex block + * - data blocks, node and leaf blocks (btree), freeindex blocks + * + * Note: many node blocks structures and constants are shared with the attr + * code and defined in xfs_da_btree.h. + */ + +#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: single block dirs */ +#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: multiblock dirs */ +#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F: free index blocks */ + +/* + * Directory Version 3 With CRCs. + * + * The tree formats are the same as for version 2 directories. The difference + * is in the block header and dirent formats. In many cases the v3 structures + * use v2 definitions as they are no different and this makes code sharing much + * easier. + * + * Also, the xfs_dir3_*() functions handle both v2 and v3 formats - if the + * format is v2 then they switch to the existing v2 code, or the format is v3 + * they implement the v3 functionality. This means the existing dir2 is a mix of + * xfs_dir2/xfs_dir3 calls and functions. The xfs_dir3 functions are called + * where there is a difference in the formats, otherwise the code is unchanged. + * + * Where it is possible, the code decides what to do based on the magic numbers + * in the blocks rather than feature bits in the superblock. This means the code + * is as independent of the external XFS code as possible as doesn't require + * passing struct xfs_mount pointers into places where it isn't really + * necessary. + * + * Version 3 includes: + * + * - a larger block header for CRC and identification purposes and so the + * offsets of all the structures inside the blocks are different. + * + * - new magic numbers to be able to detect the v2/v3 types on the fly. + */ + +#define XFS_DIR3_BLOCK_MAGIC 0x58444233 /* XDB3: single block dirs */ +#define XFS_DIR3_DATA_MAGIC 0x58444433 /* XDD3: multiblock dirs */ +#define XFS_DIR3_FREE_MAGIC 0x58444633 /* XDF3: free index blocks */ + +/* + * Dirents in version 3 directories have a file type field. Additions to this + * list are an on-disk format change, requiring feature bits. Valid values + * are as follows: + */ +#define XFS_DIR3_FT_UNKNOWN 0 +#define XFS_DIR3_FT_REG_FILE 1 +#define XFS_DIR3_FT_DIR 2 +#define XFS_DIR3_FT_CHRDEV 3 +#define XFS_DIR3_FT_BLKDEV 4 +#define XFS_DIR3_FT_FIFO 5 +#define XFS_DIR3_FT_SOCK 6 +#define XFS_DIR3_FT_SYMLINK 7 +#define XFS_DIR3_FT_WHT 8 + +#define XFS_DIR3_FT_MAX 9 + +/* + * Byte offset in data block and shortform entry. + */ +typedef __uint16_t xfs_dir2_data_off_t; +#define NULLDATAOFF 0xffffU +typedef uint xfs_dir2_data_aoff_t; /* argument form */ + +/* + * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. + * Only need 16 bits, this is the byte offset into the single block form. + */ +typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t; + +/* + * Offset in data space of a data entry. + */ +typedef __uint32_t xfs_dir2_dataptr_t; +#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) +#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) + +/* + * Byte offset in a directory. + */ +typedef xfs_off_t xfs_dir2_off_t; + +/* + * Directory block number (logical dirblk in file) + */ +typedef __uint32_t xfs_dir2_db_t; + +/* + * Inode number stored as 8 8-bit values. + */ +typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; + +/* + * Inode number stored as 4 8-bit values. + * Works a lot of the time, when all the inode numbers in a directory + * fit in 32 bits. + */ +typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; + +typedef union { + xfs_dir2_ino8_t i8; + xfs_dir2_ino4_t i4; +} xfs_dir2_inou_t; +#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) + +/* + * Directory layout when stored internal to an inode. + * + * Small directories are packed as tightly as possible so as to fit into the + * literal area of the inode. These "shortform" directories consist of a + * single xfs_dir2_sf_hdr header followed by zero or more xfs_dir2_sf_entry + * structures. Due the different inode number storage size and the variable + * length name field in the xfs_dir2_sf_entry all these structure are + * variable length, and the accessors in this file should be used to iterate + * over them. + */ +typedef struct xfs_dir2_sf_hdr { + __uint8_t count; /* count of entries */ + __uint8_t i8count; /* count of 8-byte inode #s */ + xfs_dir2_inou_t parent; /* parent dir inode number */ +} __arch_pack xfs_dir2_sf_hdr_t; + +typedef struct xfs_dir2_sf_entry { + __u8 namelen; /* actual name length */ + xfs_dir2_sf_off_t offset; /* saved offset */ + __u8 name[]; /* name, variable size */ + /* + * A single byte containing the file type field follows the inode + * number for version 3 directory entries. + * + * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a + * variable offset after the name. + */ +} __arch_pack xfs_dir2_sf_entry_t; + +static inline int xfs_dir2_sf_hdr_size(int i8count) +{ + return sizeof(struct xfs_dir2_sf_hdr) - + (i8count == 0) * + (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t)); +} + +static inline xfs_dir2_data_aoff_t +xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) +{ + return get_unaligned_be16(&sfep->offset.i); +} + +static inline void +xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) +{ + put_unaligned_be16(off, &sfep->offset.i); +} + +static inline struct xfs_dir2_sf_entry * +xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr) +{ + return (struct xfs_dir2_sf_entry *) + ((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count)); +} + +/* + * Data block structures. + * + * A pure data block looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + * + * In addition to the pure data blocks for the data and node formats, + * most structures are also used for the combined data/freespace "block" + * format below. + */ + +#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ +#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) +#define XFS_DIR2_DATA_FREE_TAG 0xffff +#define XFS_DIR2_DATA_FD_COUNT 3 + +/* + * Directory address space divided into sections, + * spaces separated by 32GB. + */ +#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) +#define XFS_DIR2_DATA_SPACE 0 +#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Describe a free area in the data block. + * + * The freespace will be formatted as a xfs_dir2_data_unused_t. + */ +typedef struct xfs_dir2_data_free { + __be16 offset; /* start of freespace */ + __be16 length; /* length of freespace */ +} xfs_dir2_data_free_t; + +/* + * Header for the data blocks. + * + * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. + */ +typedef struct xfs_dir2_data_hdr { + __be32 magic; /* XFS_DIR2_DATA_MAGIC or */ + /* XFS_DIR2_BLOCK_MAGIC */ + xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; +} xfs_dir2_data_hdr_t; + +/* + * define a structure for all the verification fields we are adding to the + * directory block structures. This will be used in several structures. + * The magic number must be the first entry to align with all the dir2 + * structures so we determine how to decode them just by the magic number. + */ +struct xfs_dir3_blk_hdr { + __be32 magic; /* magic number */ + __be32 crc; /* CRC of block */ + __be64 blkno; /* first block of the buffer */ + __be64 lsn; /* sequence number of last write */ + uuid_t uuid; /* filesystem we belong to */ + __be64 owner; /* inode that owns the block */ +}; + +struct xfs_dir3_data_hdr { + struct xfs_dir3_blk_hdr hdr; + xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ +}; + +#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) + +/* + * Active entry in a data block. + * + * Aligned to 8 bytes. After the variable length name field there is a + * 2 byte tag field, which can be accessed using xfs_dir3_data_entry_tag_p. + * + * For dir3 structures, there is file type field between the name and the tag. + * This can only be manipulated by helper functions. It is packed hard against + * the end of the name so any padding for rounding is between the file type and + * the tag. + */ +typedef struct xfs_dir2_data_entry { + __be64 inumber; /* inode number */ + __u8 namelen; /* name length */ + __u8 name[]; /* name bytes, no null */ + /* __u8 filetype; */ /* type of inode we point to */ + /* __be16 tag; */ /* starting offset of us */ +} xfs_dir2_data_entry_t; + +/* + * Unused entry in a data block. + * + * Aligned to 8 bytes. Tag appears as the last 2 bytes and must be accessed + * using xfs_dir2_data_unused_tag_p. + */ +typedef struct xfs_dir2_data_unused { + __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ + __be16 length; /* total free length */ + /* variable offset */ + __be16 tag; /* starting offset of us */ +} xfs_dir2_data_unused_t; + +/* + * Pointer to a freespace's tag word. + */ +static inline __be16 * +xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup) +{ + return (__be16 *)((char *)dup + + be16_to_cpu(dup->length) - sizeof(__be16)); +} + +/* + * Leaf block structures. + * + * A pure leaf block looks like the following drawing on disk: + * + * +---------------------------+ + * | xfs_dir2_leaf_hdr_t | + * +---------------------------+ + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | xfs_dir2_data_off_t | + * | ... | + * +---------------------------+ + * | xfs_dir2_leaf_tail_t | + * +---------------------------+ + * + * The xfs_dir2_data_off_t members (bests) and tail are at the end of the block + * for single-leaf (magic = XFS_DIR2_LEAF1_MAGIC) blocks only, but not present + * for directories with separate leaf nodes and free space blocks + * (magic = XFS_DIR2_LEAFN_MAGIC). + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +/* + * Offset of the leaf/node space. First block in this space + * is the btree root. + */ +#define XFS_DIR2_LEAF_SPACE 1 +#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) + +/* + * Leaf block header. + */ +typedef struct xfs_dir2_leaf_hdr { + xfs_da_blkinfo_t info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ +} xfs_dir2_leaf_hdr_t; + +struct xfs_dir3_leaf_hdr { + struct xfs_da3_blkinfo info; /* header for da routines */ + __be16 count; /* count of entries */ + __be16 stale; /* count of stale entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t stale; +}; + +/* + * Leaf block entry. + */ +typedef struct xfs_dir2_leaf_entry { + __be32 hashval; /* hash value of name */ + __be32 address; /* address of data entry */ +} xfs_dir2_leaf_entry_t; + +/* + * Leaf block tail. + */ +typedef struct xfs_dir2_leaf_tail { + __be32 bestcount; +} xfs_dir2_leaf_tail_t; + +/* + * Leaf block. + */ +typedef struct xfs_dir2_leaf { + xfs_dir2_leaf_hdr_t hdr; /* leaf header */ + xfs_dir2_leaf_entry_t __ents[]; /* entries */ +} xfs_dir2_leaf_t; + +struct xfs_dir3_leaf { + struct xfs_dir3_leaf_hdr hdr; /* leaf header */ + struct xfs_dir2_leaf_entry __ents[]; /* entries */ +}; + +#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc) + +/* + * Get address of the bests array in the single-leaf block. + */ +static inline __be16 * +xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp) +{ + return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); +} + +/* + * Free space block defintions for the node format. + */ + +/* + * Offset of the freespace index. + */ +#define XFS_DIR2_FREE_SPACE 2 +#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) + +typedef struct xfs_dir2_free_hdr { + __be32 magic; /* XFS_DIR2_FREE_MAGIC */ + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ +} xfs_dir2_free_hdr_t; + +typedef struct xfs_dir2_free { + xfs_dir2_free_hdr_t hdr; /* block header */ + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +} xfs_dir2_free_t; + +struct xfs_dir3_free_hdr { + struct xfs_dir3_blk_hdr hdr; + __be32 firstdb; /* db of first entry */ + __be32 nvalid; /* count of valid entries */ + __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ +}; + +struct xfs_dir3_free { + struct xfs_dir3_free_hdr hdr; + __be16 bests[]; /* best free counts */ + /* unused entries are -1 */ +}; + +#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc) + +/* + * In core version of the free block header, abstracted away from on-disk format + * differences. Use this in the code, and convert to/from the disk version using + * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk. + */ +struct xfs_dir3_icfree_hdr { + __uint32_t magic; + __uint32_t firstdb; + __uint32_t nvalid; + __uint32_t nused; + +}; + +/* + * Single block format. + * + * The single block format looks like the following drawing on disk: + * + * +-------------------------------------------------+ + * | xfs_dir2_data_hdr_t | + * +-------------------------------------------------+ + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t | + * | xfs_dir2_data_entry_t OR xfs_dir2_data_unused_t : + * | ... | + * +-------------------------------------------------+ + * | unused space | + * +-------------------------------------------------+ + * | ... | + * | xfs_dir2_leaf_entry_t | + * | xfs_dir2_leaf_entry_t | + * +-------------------------------------------------+ + * | xfs_dir2_block_tail_t | + * +-------------------------------------------------+ + * + * As all the entries are variable size structures the accessors below should + * be used to iterate over them. + */ + +typedef struct xfs_dir2_block_tail { + __be32 count; /* count of leaf entries */ + __be32 stale; /* count of stale lf entries */ +} xfs_dir2_block_tail_t; + +/* + * Pointer to the leaf entries embedded in a data block (1-block format) + */ +static inline struct xfs_dir2_leaf_entry * +xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp) +{ + return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); +} + + +/* + * Attribute storage layout + * + * Attribute lists are structured around Btrees where all the data + * elements are in the leaf nodes. Attribute names are hashed into an int, + * then that int is used as the index into the Btree. Since the hashval + * of an attribute name may not be unique, we may have duplicate keys. The + * internal links in the Btree are logical block offsets into the file. + * + * Struct leaf_entry's are packed from the top. Name/values grow from the + * bottom but are not packed. The freemap contains run-length-encoded entries + * for the free bytes after the leaf_entry's, but only the N largest such, + * smaller runs are dropped. When the freemap doesn't show enough space + * for an allocation, we compact the name/value area and try again. If we + * still don't have enough space, then we have to split the block. The + * name/value structs (both local and remote versions) must be 32bit aligned. + * + * Since we have duplicate hash keys, for each key that matches, compare + * the actual name string. The root and intermediate node search always + * takes the first-in-the-block key match found, so we should only have + * to work "forw"ard. If none matches, continue with the "forw"ard leaf + * nodes until the hash key changes or the attribute name is found. + * + * We store the fact that an attribute is a ROOT/USER/SECURE attribute in + * the leaf_entry. The namespaces are independent only because we also look + * at the namespace bit when we are looking for a matching attribute name. + * + * We also store an "incomplete" bit in the leaf_entry. It shows that an + * attribute is in the middle of being created and should not be shown to + * the user if we crash during the time that the bit is set. We clear the + * bit when we have finished setting up the attribute. We do this because + * we cannot create some large attributes inside a single transaction, and we + * need some indication that we weren't finished if we crash in the middle. + */ +#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */ + +typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ + __be16 base; /* base of free region */ + __be16 size; /* length of free region */ +} xfs_attr_leaf_map_t; + +typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */ + xfs_da_blkinfo_t info; /* block type, links, etc. */ + __be16 count; /* count of active leaf_entry's */ + __be16 usedbytes; /* num bytes of names/values stored */ + __be16 firstused; /* first used byte in name area */ + __u8 holes; /* != 0 if blk needs compaction */ + __u8 pad1; + xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE]; + /* N largest free regions */ +} xfs_attr_leaf_hdr_t; + +typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ + __be32 hashval; /* hash value of name */ + __be16 nameidx; /* index into buffer of name/value */ + __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */ + __u8 pad2; /* unused pad byte */ +} xfs_attr_leaf_entry_t; + +typedef struct xfs_attr_leaf_name_local { + __be16 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 nameval[1]; /* name/value bytes */ +} xfs_attr_leaf_name_local_t; + +typedef struct xfs_attr_leaf_name_remote { + __be32 valueblk; /* block number of value bytes */ + __be32 valuelen; /* number of bytes in value */ + __u8 namelen; /* length of name bytes */ + __u8 name[1]; /* name bytes */ +} xfs_attr_leaf_name_remote_t; + +typedef struct xfs_attr_leafblock { + xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ + xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */ + xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */ +} xfs_attr_leafblock_t; + +/* + * CRC enabled leaf structures. Called "version 3" structures to match the + * version number of the directory and dablk structures for this feature, and + * attr2 is already taken by the variable inode attribute fork size feature. + */ +struct xfs_attr3_leaf_hdr { + struct xfs_da3_blkinfo info; + __be16 count; + __be16 usedbytes; + __be16 firstused; + __u8 holes; + __u8 pad1; + struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ +}; + +#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) + +struct xfs_attr3_leafblock { + struct xfs_attr3_leaf_hdr hdr; + struct xfs_attr_leaf_entry entries[1]; + + /* + * The rest of the block contains the following structures after the + * leaf entries, growing from the bottom up. The variables are never + * referenced, the locations accessed purely from helper functions. + * + * struct xfs_attr_leaf_name_local + * struct xfs_attr_leaf_name_remote + */ +}; + +/* + * incore, neutral version of the attribute leaf header + */ +struct xfs_attr3_icleaf_hdr { + __uint32_t forw; + __uint32_t back; + __uint16_t magic; + __uint16_t count; + __uint16_t usedbytes; + __uint16_t firstused; + __u8 holes; + struct { + __uint16_t base; + __uint16_t size; + } freemap[XFS_ATTR_LEAF_MAPSIZE]; +}; + +/* + * Flags used in the leaf_entry[i].flags field. + * NOTE: the INCOMPLETE bit must not collide with the flags bits specified + * on the system call, they are "or"ed together for various operations. + */ +#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */ +#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */ +#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */ +#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */ +#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT) +#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT) +#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT) +#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT) + +/* + * Conversion macros for converting namespace bits from argument flags + * to ondisk flags. + */ +#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) +#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK) +#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK) +#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\ + ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0)) +#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\ + ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0)) + +/* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) + */ +#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t)) + +static inline int +xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return sizeof(struct xfs_attr3_leaf_hdr); + return sizeof(struct xfs_attr_leaf_hdr); +} + +static inline struct xfs_attr_leaf_entry * +xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp) +{ + if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) + return &((struct xfs_attr3_leafblock *)leafp)->entries[0]; + return &leafp->entries[0]; +} + +/* + * Cast typed pointers for "local" and "remote" name/value structs. + */ +static inline char * +xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx) +{ + struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp); + + return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)]; +} + +static inline xfs_attr_leaf_name_remote_t * +xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx); +} + +static inline xfs_attr_leaf_name_local_t * +xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) +{ + return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx); +} + +/* + * Calculate total bytes used (including trailing pad for alignment) for + * a "local" name/value structure, a "remote" name/value structure, and + * a pointer which might be either. + */ +static inline int xfs_attr_leaf_entsize_remote(int nlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) +{ + return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + + XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); +} + +static inline int xfs_attr_leaf_entsize_local_max(int bsize) +{ + return (((bsize) >> 1) + ((bsize) >> 2)); +} + + + +/* + * Remote attribute block format definition + * + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ +#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ + +struct xfs_attr3_rmt_hdr { + __be32 rm_magic; + __be32 rm_offset; + __be32 rm_bytes; + __be32 rm_crc; + uuid_t rm_uuid; + __be64 rm_owner; + __be64 rm_blkno; + __be64 rm_lsn; +}; + +#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) + +#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_attr3_rmt_hdr) : 0)) + +#endif /* __XFS_DA_FORMAT_H__ */ diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c deleted file mode 100644 index 80562b60fb9..00000000000 --- a/fs/xfs/xfs_dfrag.c +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_dfrag.h" -#include "xfs_error.h" -#include "xfs_mac.h" -#include "xfs_rw.h" - -/* - * Syssgi interface for swapext - */ -int -xfs_swapext( - xfs_swapext_t __user *sxu) -{ - xfs_swapext_t *sxp; - xfs_inode_t *ip=NULL, *tip=NULL; - xfs_mount_t *mp; - struct file *fp = NULL, *tfp = NULL; - bhv_vnode_t *vp, *tvp; - int error = 0; - - sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL); - if (!sxp) { - error = XFS_ERROR(ENOMEM); - goto error0; - } - - if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) { - error = XFS_ERROR(EFAULT); - goto error0; - } - - /* Pull information for the target fd */ - if (((fp = fget((int)sxp->sx_fdtarget)) == NULL) || - ((vp = vn_from_inode(fp->f_dentry->d_inode)) == NULL)) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - ip = xfs_vtoi(vp); - if (ip == NULL) { - error = XFS_ERROR(EBADF); - goto error0; - } - - if (((tfp = fget((int)sxp->sx_fdtmp)) == NULL) || - ((tvp = vn_from_inode(tfp->f_dentry->d_inode)) == NULL)) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - tip = xfs_vtoi(tvp); - if (tip == NULL) { - error = XFS_ERROR(EBADF); - goto error0; - } - - if (ip->i_mount != tip->i_mount) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - if (ip->i_ino == tip->i_ino) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) { - error = XFS_ERROR(EIO); - goto error0; - } - - error = XFS_SWAP_EXTENTS(mp, &ip->i_iocore, &tip->i_iocore, sxp); - - error0: - if (fp != NULL) - fput(fp); - if (tfp != NULL) - fput(tfp); - - if (sxp != NULL) - kmem_free(sxp, sizeof(xfs_swapext_t)); - - return error; -} - -int -xfs_swap_extents( - xfs_inode_t *ip, - xfs_inode_t *tip, - xfs_swapext_t *sxp) -{ - xfs_mount_t *mp; - xfs_inode_t *ips[2]; - xfs_trans_t *tp; - xfs_bstat_t *sbp = &sxp->sx_stat; - bhv_vnode_t *vp, *tvp; - xfs_ifork_t *tempifp, *ifp, *tifp; - int ilf_fields, tilf_fields; - static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL; - int error = 0; - int aforkblks = 0; - int taforkblks = 0; - __uint64_t tmp; - char locked = 0; - - mp = ip->i_mount; - - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); - if (!tempifp) { - error = XFS_ERROR(ENOMEM); - goto error0; - } - - sbp = &sxp->sx_stat; - vp = XFS_ITOV(ip); - tvp = XFS_ITOV(tip); - - /* Lock in i_ino order */ - if (ip->i_ino < tip->i_ino) { - ips[0] = ip; - ips[1] = tip; - } else { - ips[0] = tip; - ips[1] = ip; - } - - xfs_lock_inodes(ips, 2, 0, lock_flags); - locked = 1; - - /* Check permissions */ - error = xfs_iaccess(ip, S_IWUSR, NULL); - if (error) - goto error0; - - error = xfs_iaccess(tip, S_IWUSR, NULL); - if (error) - goto error0; - - /* Verify that both files have the same format */ - if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - /* Verify both files are either real-time or non-realtime */ - if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != - (tip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - if (VN_CACHED(tvp) != 0) { - xfs_inval_cached_trace(&tip->i_iocore, 0, -1, 0, -1); - bhv_vop_flushinval_pages(tvp, 0, -1, FI_REMAPF_LOCKED); - } - - /* Verify O_DIRECT for ftmp */ - if (VN_CACHED(tvp) != 0) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - /* Verify all data are being swapped */ - if (sxp->sx_offset != 0 || - sxp->sx_length != ip->i_d.di_size || - sxp->sx_length != tip->i_d.di_size) { - error = XFS_ERROR(EFAULT); - goto error0; - } - - /* - * If the target has extended attributes, the tmp file - * must also in order to ensure the correct data fork - * format. - */ - if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { - error = XFS_ERROR(EINVAL); - goto error0; - } - - /* - * Compare the current change & modify times with that - * passed in. If they differ, we abort this swap. - * This is the mechanism used to ensure the calling - * process that the file was not changed out from - * under it. - */ - if ((sbp->bs_ctime.tv_sec != ip->i_d.di_ctime.t_sec) || - (sbp->bs_ctime.tv_nsec != ip->i_d.di_ctime.t_nsec) || - (sbp->bs_mtime.tv_sec != ip->i_d.di_mtime.t_sec) || - (sbp->bs_mtime.tv_nsec != ip->i_d.di_mtime.t_nsec)) { - error = XFS_ERROR(EBUSY); - goto error0; - } - - /* We need to fail if the file is memory mapped. Once we have tossed - * all existing pages, the page fault will have no option - * but to go to the filesystem for pages. By making the page fault call - * vop_read (or write in the case of autogrow) they block on the iolock - * until we have switched the extents. - */ - if (VN_MAPPED(vp)) { - error = XFS_ERROR(EBUSY); - goto error0; - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_iunlock(tip, XFS_ILOCK_EXCL); - - /* - * There is a race condition here since we gave up the - * ilock. However, the data fork will not change since - * we have the iolock (locked for truncation too) so we - * are safe. We don't really care if non-io related - * fields change. - */ - - bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); - if ((error = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - xfs_iunlock(tip, XFS_IOLOCK_EXCL); - xfs_trans_cancel(tp, 0); - locked = 0; - goto error0; - } - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); - - /* - * Count the number of extended attribute blocks - */ - if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) && - (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } - } - if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) && - (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) { - error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK, - &taforkblks); - if (error) { - xfs_trans_cancel(tp, 0); - goto error0; - } - } - - /* - * Swap the data forks of the inodes - */ - ifp = &ip->i_df; - tifp = &tip->i_df; - *tempifp = *ifp; /* struct copy */ - *ifp = *tifp; /* struct copy */ - *tifp = *tempifp; /* struct copy */ - - /* - * Fix the on-disk inode values - */ - tmp = (__uint64_t)ip->i_d.di_nblocks; - ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks; - tip->i_d.di_nblocks = tmp + taforkblks - aforkblks; - - tmp = (__uint64_t) ip->i_d.di_nextents; - ip->i_d.di_nextents = tip->i_d.di_nextents; - tip->i_d.di_nextents = tmp; - - tmp = (__uint64_t) ip->i_d.di_format; - ip->i_d.di_format = tip->i_d.di_format; - tip->i_d.di_format = tmp; - - ilf_fields = XFS_ILOG_CORE; - - switch(ip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. - */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } - ilf_fields |= XFS_ILOG_DEXT; - break; - case XFS_DINODE_FMT_BTREE: - ilf_fields |= XFS_ILOG_DBROOT; - break; - } - - tilf_fields = XFS_ILOG_CORE; - - switch(tip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. - */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } - tilf_fields |= XFS_ILOG_DEXT; - break; - case XFS_DINODE_FMT_BTREE: - tilf_fields |= XFS_ILOG_DBROOT; - break; - } - - /* - * Increment vnode ref counts since xfs_trans_commit & - * xfs_trans_cancel will both unlock the inodes and - * decrement the associated ref counts. - */ - VN_HOLD(vp); - VN_HOLD(tvp); - - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); - - xfs_trans_log_inode(tp, ip, ilf_fields); - xfs_trans_log_inode(tp, tip, tilf_fields); - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - */ - if (mp->m_flags & XFS_MOUNT_WSYNC) { - xfs_trans_set_sync(tp); - } - - error = xfs_trans_commit(tp, XFS_TRANS_SWAPEXT, NULL); - locked = 0; - - error0: - if (locked) { - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - } - if (tempifp != NULL) - kmem_free(tempifp, sizeof(xfs_ifork_t)); - return error; -} diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h deleted file mode 100644 index da178205be6..00000000000 --- a/fs/xfs/xfs_dfrag.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DFRAG_H__ -#define __XFS_DFRAG_H__ - -/* - * Structure passed to xfs_swapext - */ - -typedef struct xfs_swapext -{ - __int64_t sx_version; /* version */ - __int64_t sx_fdtarget; /* fd of target file */ - __int64_t sx_fdtmp; /* fd of tmp file */ - xfs_off_t sx_offset; /* offset into file */ - xfs_off_t sx_length; /* leng from offset */ - char sx_pad[16]; /* pad space, unused */ - xfs_bstat_t sx_stat; /* stat of target b4 copy */ -} xfs_swapext_t; - -/* - * Version flag - */ -#define XFS_SX_VERSION 0 - -#ifdef __KERNEL__ -/* - * Prototypes for visible xfs_dfrag.c routines. - */ - -/* - * Syscall interface for xfs_swapext - */ -int xfs_swapext(struct xfs_swapext __user *sx); - -int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, - struct xfs_swapext *sxp); - -#endif /* __KERNEL__ */ - -#endif /* __XFS_DFRAG_H__ */ diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h index b33826961c4..623bbe8fd92 100644 --- a/fs/xfs/xfs_dinode.h +++ b/fs/xfs/xfs_dinode.h @@ -18,85 +18,91 @@ #ifndef __XFS_DINODE_H__ #define __XFS_DINODE_H__ -struct xfs_buf; -struct xfs_mount; +#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ +#define XFS_DINODE_GOOD_VERSION(v) ((v) >= 1 && (v) <= 3) -#define XFS_DINODE_VERSION_1 1 -#define XFS_DINODE_VERSION_2 2 -#define XFS_DINODE_GOOD_VERSION(v) \ - (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2)) -#define XFS_DINODE_MAGIC 0x494e /* 'IN' */ - -/* - * Disk inode structure. - * This is just the header; the inode is expanded to fill a variable size - * with the last field expanding. It is split into the core and "other" - * because we only need the core part in the in-core inode. - */ typedef struct xfs_timestamp { - __int32_t t_sec; /* timestamp seconds */ - __int32_t t_nsec; /* timestamp nanoseconds */ + __be32 t_sec; /* timestamp seconds */ + __be32 t_nsec; /* timestamp nanoseconds */ } xfs_timestamp_t; /* - * Note: Coordinate changes to this structure with the XFS_DI_* #defines - * below and the offsets table in xfs_ialloc_log_di(). + * On-disk inode structure. + * + * This is just the header or "dinode core", the inode is expanded to fill a + * variable size the leftover area split into a data and an attribute fork. + * The format of the data and attribute fork depends on the format of the + * inode as indicated by di_format and di_aformat. To access the data and + * attribute use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR macros + * below. + * + * There is a very similar struct icdinode in xfs_inode which matches the + * layout of the first 96 bytes of this structure, but is kept in native + * format instead of big endian. + * + * Note: di_flushiter is only used by v1/2 inodes - it's effectively a zeroed + * padding field for v3 inodes. */ -typedef struct xfs_dinode_core -{ - __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ - __uint16_t di_mode; /* mode and type of file */ - __int8_t di_version; /* inode version */ - __int8_t di_format; /* format of di_c data */ - __uint16_t di_onlink; /* old number of links to file */ - __uint32_t di_uid; /* owner's user id */ - __uint32_t di_gid; /* owner's group id */ - __uint32_t di_nlink; /* number of links to file */ - __uint16_t di_projid; /* owner's project id */ - __uint8_t di_pad[8]; /* unused, zeroed space */ - __uint16_t di_flushiter; /* incremented on flush */ +typedef struct xfs_dinode { + __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __be16 di_mode; /* mode and type of file */ + __u8 di_version; /* inode version */ + __u8 di_format; /* format of di_c data */ + __be16 di_onlink; /* old number of links to file */ + __be32 di_uid; /* owner's user id */ + __be32 di_gid; /* owner's group id */ + __be32 di_nlink; /* number of links to file */ + __be16 di_projid_lo; /* lower part of owner's project id */ + __be16 di_projid_hi; /* higher part owner's project id */ + __u8 di_pad[6]; /* unused, zeroed space */ + __be16 di_flushiter; /* incremented on flush */ xfs_timestamp_t di_atime; /* time last accessed */ xfs_timestamp_t di_mtime; /* time last modified */ xfs_timestamp_t di_ctime; /* time created/inode modified */ - xfs_fsize_t di_size; /* number of bytes in file */ - xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ - xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ - xfs_extnum_t di_nextents; /* number of extents in data fork */ - xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ - __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ - __int8_t di_aformat; /* format of attr fork's data */ - __uint32_t di_dmevmask; /* DMIG event mask */ - __uint16_t di_dmstate; /* DMIG state info */ - __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ - __uint32_t di_gen; /* generation number */ -} xfs_dinode_core_t; + __be64 di_size; /* number of bytes in file */ + __be64 di_nblocks; /* # of direct & btree blocks used */ + __be32 di_extsize; /* basic/minimum extent size for file */ + __be32 di_nextents; /* number of extents in data fork */ + __be16 di_anextents; /* number of extents in attribute fork*/ + __u8 di_forkoff; /* attr fork offs, <<3 for 64b align */ + __s8 di_aformat; /* format of attr fork's data */ + __be32 di_dmevmask; /* DMIG event mask */ + __be16 di_dmstate; /* DMIG state info */ + __be16 di_flags; /* random flags, XFS_DIFLAG_... */ + __be32 di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + __be32 di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __le32 di_crc; /* CRC of the inode */ + __be64 di_changecount; /* number of attribute changes */ + __be64 di_lsn; /* flush sequence */ + __be64 di_flags2; /* more random flags */ + __u8 di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_timestamp_t di_crtime; /* time created */ + __be64 di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_dinode_t; + +#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) #define DI_MAX_FLUSH 0xffff -typedef struct xfs_dinode +/* + * Size of the core inode on disk. Version 1 and 2 inodes have + * the same size, but version 3 has grown a few additional fields. + */ +static inline uint xfs_dinode_size(int version) { - xfs_dinode_core_t di_core; - /* - * In adding anything between the core and the union, be - * sure to update the macros like XFS_LITINO below and - * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h. - */ - xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ - union { - xfs_bmdr_block_t di_bmbt; /* btree root block */ - xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */ - xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */ - char di_c[1]; /* local contents */ - xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */ - uuid_t di_muuid; /* mount point value */ - char di_symlink[1]; /* local symbolic link */ - } di_u; - union { - xfs_bmdr_block_t di_abmbt; /* btree root block */ - xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */ - xfs_attr_shortform_t di_attrsf; /* shortform attribute list */ - } di_a; -} xfs_dinode_t; + if (version == 3) + return sizeof(struct xfs_dinode); + return offsetof(struct xfs_dinode, di_crc); +} /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. @@ -107,50 +113,14 @@ typedef struct xfs_dinode #define XFS_MAXLINK_1 65535U /* - * Bit names for logging disk inodes only - */ -#define XFS_DI_MAGIC 0x0000001 -#define XFS_DI_MODE 0x0000002 -#define XFS_DI_VERSION 0x0000004 -#define XFS_DI_FORMAT 0x0000008 -#define XFS_DI_ONLINK 0x0000010 -#define XFS_DI_UID 0x0000020 -#define XFS_DI_GID 0x0000040 -#define XFS_DI_NLINK 0x0000080 -#define XFS_DI_PROJID 0x0000100 -#define XFS_DI_PAD 0x0000200 -#define XFS_DI_ATIME 0x0000400 -#define XFS_DI_MTIME 0x0000800 -#define XFS_DI_CTIME 0x0001000 -#define XFS_DI_SIZE 0x0002000 -#define XFS_DI_NBLOCKS 0x0004000 -#define XFS_DI_EXTSIZE 0x0008000 -#define XFS_DI_NEXTENTS 0x0010000 -#define XFS_DI_NAEXTENTS 0x0020000 -#define XFS_DI_FORKOFF 0x0040000 -#define XFS_DI_AFORMAT 0x0080000 -#define XFS_DI_DMEVMASK 0x0100000 -#define XFS_DI_DMSTATE 0x0200000 -#define XFS_DI_FLAGS 0x0400000 -#define XFS_DI_GEN 0x0800000 -#define XFS_DI_NEXT_UNLINKED 0x1000000 -#define XFS_DI_U 0x2000000 -#define XFS_DI_A 0x4000000 -#define XFS_DI_NUM_BITS 27 -#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1) -#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A)) - -/* * Values for di_format */ -typedef enum xfs_dinode_fmt -{ - XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */ - XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */ - /* LNK: di_symlink */ - XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */ - XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */ - XFS_DINODE_FMT_UUID /* MNT: di_uuid */ +typedef enum xfs_dinode_fmt { + XFS_DINODE_FMT_DEV, /* xfs_dev_t */ + XFS_DINODE_FMT_LOCAL, /* bulk data */ + XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */ + XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */ + XFS_DINODE_FMT_UUID /* uuid_t */ } xfs_dinode_fmt_t; /* @@ -164,79 +134,62 @@ typedef enum xfs_dinode_fmt /* * Inode size for given fs. */ -#define XFS_LITINO(mp) ((mp)->m_litino) -#define XFS_BROOT_SIZE_ADJ \ - (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t)) +#define XFS_LITINO(mp, version) \ + ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version))) /* * Inode data & attribute fork sizes, per inode. */ -#define XFS_CFORK_Q(dcp) ((dcp)->di_forkoff != 0) -#define XFS_CFORK_Q_DISK(dcp) ((dcp)->di_forkoff != 0) - -#define XFS_CFORK_BOFF(dcp) ((int)((dcp)->di_forkoff << 3)) -#define XFS_CFORK_BOFF_DISK(dcp) \ - ((int)(INT_GET((dcp)->di_forkoff, ARCH_CONVERT) << 3)) - -#define XFS_CFORK_DSIZE_DISK(dcp,mp) \ - (XFS_CFORK_Q_DISK(dcp) ? XFS_CFORK_BOFF_DISK(dcp) : XFS_LITINO(mp)) -#define XFS_CFORK_DSIZE(dcp,mp) \ - (XFS_CFORK_Q(dcp) ? XFS_CFORK_BOFF(dcp) : XFS_LITINO(mp)) - -#define XFS_CFORK_ASIZE_DISK(dcp,mp) \ - (XFS_CFORK_Q_DISK(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF_DISK(dcp) : 0) -#define XFS_CFORK_ASIZE(dcp,mp) \ - (XFS_CFORK_Q(dcp) ? XFS_LITINO(mp) - XFS_CFORK_BOFF(dcp) : 0) - -#define XFS_CFORK_SIZE_DISK(dcp,mp,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_CFORK_DSIZE_DISK(dcp, mp) : \ - XFS_CFORK_ASIZE_DISK(dcp, mp)) -#define XFS_CFORK_SIZE(dcp,mp,w) \ - ((w) == XFS_DATA_FORK ? \ - XFS_CFORK_DSIZE(dcp, mp) : XFS_CFORK_ASIZE(dcp, mp)) +#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0) +#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3)) #define XFS_DFORK_DSIZE(dip,mp) \ - XFS_CFORK_DSIZE_DISK(&(dip)->di_core, mp) -#define XFS_DFORK_DSIZE_HOST(dip,mp) \ - XFS_CFORK_DSIZE(&(dip)->di_core, mp) + (XFS_DFORK_Q(dip) ? \ + XFS_DFORK_BOFF(dip) : \ + XFS_LITINO(mp, (dip)->di_version)) #define XFS_DFORK_ASIZE(dip,mp) \ - XFS_CFORK_ASIZE_DISK(&(dip)->di_core, mp) -#define XFS_DFORK_ASIZE_HOST(dip,mp) \ - XFS_CFORK_ASIZE(&(dip)->di_core, mp) -#define XFS_DFORK_SIZE(dip,mp,w) \ - XFS_CFORK_SIZE_DISK(&(dip)->di_core, mp, w) -#define XFS_DFORK_SIZE_HOST(dip,mp,w) \ - XFS_CFORK_SIZE(&(dip)->di_core, mp, w) + (XFS_DFORK_Q(dip) ? \ + XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \ + 0) +#define XFS_DFORK_SIZE(dip,mp,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_DFORK_DSIZE(dip, mp) : \ + XFS_DFORK_ASIZE(dip, mp)) -#define XFS_DFORK_Q(dip) XFS_CFORK_Q_DISK(&(dip)->di_core) -#define XFS_DFORK_BOFF(dip) XFS_CFORK_BOFF_DISK(&(dip)->di_core) -#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c) -#define XFS_DFORK_APTR(dip) \ - ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip)) -#define XFS_DFORK_PTR(dip,w) \ +/* + * Return pointers to the data or attribute forks. + */ +#define XFS_DFORK_DPTR(dip) \ + ((char *)dip + xfs_dinode_size(dip->di_version)) +#define XFS_DFORK_APTR(dip) \ + (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip)) +#define XFS_DFORK_PTR(dip,w) \ ((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip)) -#define XFS_CFORK_FORMAT(dcp,w) \ - ((w) == XFS_DATA_FORK ? (dcp)->di_format : (dcp)->di_aformat) -#define XFS_CFORK_FMT_SET(dcp,w,n) \ - ((w) == XFS_DATA_FORK ? \ - ((dcp)->di_format = (n)) : ((dcp)->di_aformat = (n))) -#define XFS_DFORK_FORMAT(dip,w) XFS_CFORK_FORMAT(&(dip)->di_core, w) -#define XFS_CFORK_NEXTENTS_DISK(dcp,w) \ +#define XFS_DFORK_FORMAT(dip,w) \ ((w) == XFS_DATA_FORK ? \ - INT_GET((dcp)->di_nextents, ARCH_CONVERT) : \ - INT_GET((dcp)->di_anextents, ARCH_CONVERT)) -#define XFS_CFORK_NEXTENTS(dcp,w) \ - ((w) == XFS_DATA_FORK ? (dcp)->di_nextents : (dcp)->di_anextents) -#define XFS_DFORK_NEXTENTS(dip,w) XFS_CFORK_NEXTENTS_DISK(&(dip)->di_core, w) -#define XFS_DFORK_NEXTENTS_HOST(dip,w) XFS_CFORK_NEXTENTS(&(dip)->di_core, w) - -#define XFS_CFORK_NEXT_SET(dcp,w,n) \ + (dip)->di_format : \ + (dip)->di_aformat) +#define XFS_DFORK_NEXTENTS(dip,w) \ ((w) == XFS_DATA_FORK ? \ - ((dcp)->di_nextents = (n)) : ((dcp)->di_anextents = (n))) + be32_to_cpu((dip)->di_nextents) : \ + be16_to_cpu((dip)->di_anextents)) + +#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)((bp)->b_addr)) + +/* + * For block and character special files the 32bit dev_t is stored at the + * beginning of the data fork. + */ +static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip) +{ + return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip)); +} -#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp)) +static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) +{ + *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev); +} /* * Values for di_flags @@ -257,6 +210,7 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSIZE_BIT 11 /* inode extent size allocator hint */ #define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */ #define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */ +#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */ #define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT) #define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT) #define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT) @@ -271,12 +225,19 @@ typedef enum xfs_dinode_fmt #define XFS_DIFLAG_EXTSIZE (1 << XFS_DIFLAG_EXTSIZE_BIT) #define XFS_DIFLAG_EXTSZINHERIT (1 << XFS_DIFLAG_EXTSZINHERIT_BIT) #define XFS_DIFLAG_NODEFRAG (1 << XFS_DIFLAG_NODEFRAG_BIT) +#define XFS_DIFLAG_FILESTREAM (1 << XFS_DIFLAG_FILESTREAM_BIT) + +#ifdef CONFIG_XFS_RT +#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) +#else +#define XFS_IS_REALTIME_INODE(ip) (0) +#endif #define XFS_DIFLAG_ANY \ (XFS_DIFLAG_REALTIME | XFS_DIFLAG_PREALLOC | XFS_DIFLAG_NEWRTBM | \ XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND | XFS_DIFLAG_SYNC | \ XFS_DIFLAG_NOATIME | XFS_DIFLAG_NODUMP | XFS_DIFLAG_RTINHERIT | \ XFS_DIFLAG_PROJINHERIT | XFS_DIFLAG_NOSYMLINKS | XFS_DIFLAG_EXTSIZE | \ - XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG) + XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) #endif /* __XFS_DINODE_H__ */ diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index 8edbe1adb95..79670cda48a 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -17,54 +17,142 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_bmap.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_dinode.h" -static int xfs_dir2_put_dirent64_direct(xfs_dir2_put_args_t *pa); -static int xfs_dir2_put_dirent64_uio(xfs_dir2_put_args_t *pa); +struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; -void -xfs_dir_mount( - xfs_mount_t *mp) + +/* + * ASCII case-insensitive (ie. A-Z) support for directories that was + * used in IRIX. + */ +STATIC xfs_dahash_t +xfs_ascii_ci_hashname( + struct xfs_name *name) +{ + xfs_dahash_t hash; + int i; + + for (i = 0, hash = 0; i < name->len; i++) + hash = tolower(name->name[i]) ^ rol32(hash, 7); + + return hash; +} + +STATIC enum xfs_dacmp +xfs_ascii_ci_compname( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + enum xfs_dacmp result; + int i; + + if (args->namelen != len) + return XFS_CMP_DIFFERENT; + + result = XFS_CMP_EXACT; + for (i = 0; i < len; i++) { + if (args->name[i] == name[i]) + continue; + if (tolower(args->name[i]) != tolower(name[i])) + return XFS_CMP_DIFFERENT; + result = XFS_CMP_CASE; + } + + return result; +} + +static struct xfs_nameops xfs_ascii_ci_nameops = { + .hashname = xfs_ascii_ci_hashname, + .compname = xfs_ascii_ci_compname, +}; + +int +xfs_da_mount( + struct xfs_mount *mp) { - ASSERT(XFS_SB_VERSION_HASDIRV2(&mp->m_sb)); + struct xfs_da_geometry *dageo; + int nodehdr_size; + + + ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT); ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <= XFS_MAX_BLOCKSIZE); - mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog); - mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog; - mp->m_dirdatablk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_DATA_FIRSTDB(mp)); - mp->m_dirleafblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_LEAF_FIRSTDB(mp)); - mp->m_dirfreeblk = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_FREE_FIRSTDB(mp)); - mp->m_attr_node_ents = - (mp->m_sb.sb_blocksize - (uint)sizeof(xfs_da_node_hdr_t)) / - (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_node_ents = - (mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) / - (uint)sizeof(xfs_da_node_entry_t); - mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100; + + mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL); + mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL); + + nodehdr_size = mp->m_dir_inode_ops->node_hdr_size; + mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry), + KM_SLEEP | KM_MAYFAIL); + if (!mp->m_dir_geo || !mp->m_attr_geo) { + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); + return ENOMEM; + } + + /* set up directory geometry */ + dageo = mp->m_dir_geo; + dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; + + /* + * Now we've set up the block conversion variables, we can calculate the + * segment block constants using the geometry structure. + */ + dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET); + dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET); + dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET); + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + /* set up attribute geometry - single fsb only */ + dageo = mp->m_attr_geo; + dageo->blklog = mp->m_sb.sb_blocklog; + dageo->fsblog = mp->m_sb.sb_blocklog; + dageo->blksize = 1 << dageo->blklog; + dageo->fsbcount = 1; + dageo->node_ents = (dageo->blksize - nodehdr_size) / + (uint)sizeof(xfs_da_node_entry_t); + dageo->magicpct = (dageo->blksize * 37) / 100; + + if (xfs_sb_version_hasasciici(&mp->m_sb)) + mp->m_dirnameops = &xfs_ascii_ci_nameops; + else + mp->m_dirnameops = &xfs_default_nameops; + + return 0; +} + +void +xfs_da_unmount( + struct xfs_mount *mp) +{ + kmem_free(mp->m_dir_geo); + kmem_free(mp->m_attr_geo); } /* @@ -74,15 +162,15 @@ int xfs_dir_isempty( xfs_inode_t *dp) { - xfs_dir2_sf_t *sfp; + xfs_dir2_sf_hdr_t *sfp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); if (dp->i_d.di_size == 0) /* might happen during shutdown. */ return 1; if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) return 0; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - return !sfp->hdr.count; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + return !sfp->count; } /* @@ -111,7 +199,7 @@ xfs_dir_ino_validate( XFS_AGINO_TO_INO(mp, agno, agino) == ino; if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, XFS_RANDOM_DIR_INO_VALIDATE))) { - xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", + xfs_warn(mp, "Invalid inode number 0x%Lx", (unsigned long long) ino); XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); return XFS_ERROR(EFSCORRUPTED); @@ -128,16 +216,24 @@ xfs_dir_init( xfs_inode_t *dp, xfs_inode_t *pdp) { - xfs_da_args_t args; + struct xfs_da_args *args; int error; - memset((char *)&args, 0, sizeof(args)); - args.dp = dp; - args.trans = tp; - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); - if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) + ASSERT(S_ISDIR(dp->i_d.di_mode)); + error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino); + if (error) return error; - return xfs_dir2_sf_create(&args, pdp->i_ino); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->dp = dp; + args->trans = tp; + error = xfs_dir2_sf_create(args, pdp->i_ino); + kmem_free(args); + return error; } /* @@ -147,97 +243,166 @@ int xfs_dir_createname( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t inum, /* new entry inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + ASSERT(S_ISDIR(dp->i_d.di_mode)); + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; XFS_STATS_INC(xs_dir_create); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.justcheck = 0; - args.addname = args.oknoent = 1; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); + +out_free: + kmem_free(args); return rval; } /* + * If doing a CI lookup and case-insensitive match, dup actual name into + * args.value. Return EEXIST for success (ie. name found) or an error. + */ +int +xfs_dir_cilookup_result( + struct xfs_da_args *args, + const unsigned char *name, + int len) +{ + if (args->cmpresult == XFS_CMP_DIFFERENT) + return ENOENT; + if (args->cmpresult != XFS_CMP_CASE || + !(args->op_flags & XFS_DA_OP_CILOOKUP)) + return EEXIST; + + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); + if (!args->value) + return ENOMEM; + + memcpy(args->value, name, len); + args->valuelen = len; + return EEXIST; +} + +/* * Lookup a name in a directory, give back the inode number. + * If ci_name is not NULL, returns the actual name in ci_name if it differs + * to name, or ci_name->name is set to NULL for an exact match. */ + int xfs_dir_lookup( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, - xfs_ino_t *inum) /* out: inode number */ + struct xfs_name *name, + xfs_ino_t *inum, /* out: inode number */ + struct xfs_name *ci_name) /* out: actual name if CI match */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_lookup); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = 0; - args.dp = dp; - args.firstblock = NULL; - args.flist = NULL; - args.total = 0; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.justcheck = args.addname = 0; - args.oknoent = 1; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_lookup(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_lookup(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_lookup(&args); + /* + * We need to use KM_NOFS here so that lockdep will not throw false + * positive deadlock warnings on a non-transactional lookup path. It is + * safe to recurse into inode recalim in that case, but lockdep can't + * easily be taught about it. Hence KM_NOFS avoids having to add more + * lockdep Doing this avoids having to add a bunch of lockdep class + * annotations into the reclaim path for the ilock. + */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_OKNOENT; + if (ci_name) + args->op_flags |= XFS_DA_OP_CILOOKUP; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_lookup(args); + goto out_check_rval; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_lookup(args); else - rval = xfs_dir2_node_lookup(&args); + rval = xfs_dir2_node_lookup(args); + +out_check_rval: if (rval == EEXIST) rval = 0; - if (rval == 0) - *inum = args.inumber; + if (!rval) { + *inum = args->inumber; + if (ci_name) { + ci_name->name = args->value; + ci_name->len = args->valuelen; + } + } +out_free: + kmem_free(args); return rval; } @@ -248,92 +413,58 @@ int xfs_dir_removename( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, - int namelen, + struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); XFS_STATS_INC(xs_dir_remove); - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = ino; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.justcheck = args.addname = args.oknoent = 0; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_removename(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_removename(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_removename(&args); - else - rval = xfs_dir2_node_removename(&args); - return rval; -} - -/* - * Read a directory. - */ -int -xfs_dir_getdents( - xfs_trans_t *tp, - xfs_inode_t *dp, - uio_t *uio, /* caller's buffer control */ - int *eofp) /* out: eof reached */ -{ - int alignment; /* alignment required for ABI */ - xfs_dirent_t *dbp; /* malloc'ed buffer */ - xfs_dir2_put_t put; /* entry formatting routine */ - int rval; /* return value */ - int v; /* type-checking value */ + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = ino; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_removename(args); + goto out_free; + } - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); - XFS_STATS_INC(xs_dir_getdents); - /* - * If our caller has given us a single contiguous aligned memory buffer, - * just work directly within that buffer. If it's in user memory, - * lock it down first. - */ - alignment = sizeof(xfs_off_t) - 1; - if ((uio->uio_iovcnt == 1) && - (((__psint_t)uio->uio_iov[0].iov_base & alignment) == 0) && - ((uio->uio_iov[0].iov_len & alignment) == 0)) { - dbp = NULL; - put = xfs_dir2_put_dirent64_direct; - } else { - dbp = kmem_alloc(sizeof(*dbp) + MAXNAMELEN, KM_SLEEP); - put = xfs_dir2_put_dirent64_uio; + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_removename(args); + goto out_free; } - *eofp = 0; - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_getdents(dp, uio, eofp, dbp, put); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - ; - else if (v) - rval = xfs_dir2_block_getdents(tp, dp, uio, eofp, dbp, put); + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_removename(args); else - rval = xfs_dir2_leaf_getdents(tp, dp, uio, eofp, dbp, put); - if (dbp != NULL) - kmem_free(dbp, sizeof(*dbp) + MAXNAMELEN); + rval = xfs_dir2_node_removename(args); +out_free: + kmem_free(args); return rval; } @@ -344,89 +475,121 @@ int xfs_dir_replace( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, /* name of entry to replace */ - int namelen, + struct xfs_name *name, /* name of entry to replace */ xfs_ino_t inum, /* new inode number */ xfs_fsblock_t *first, /* bmap's firstblock */ xfs_bmap_free_t *flist, /* bmap's freeblock list */ xfs_extlen_t total) /* bmap's total block count */ { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); + ASSERT(S_ISDIR(dp->i_d.di_mode)); - if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) return rval; - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = inum; - args.dp = dp; - args.firstblock = first; - args.flist = flist; - args.total = total; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.justcheck = args.addname = args.oknoent = 0; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_replace(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_replace(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_replace(&args); + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->inumber = inum; + args->dp = dp; + args->firstblock = first; + args->flist = flist; + args->total = total; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_replace(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_replace(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_replace(args); else - rval = xfs_dir2_node_replace(&args); + rval = xfs_dir2_node_replace(args); +out_free: + kmem_free(args); return rval; } /* * See if this entry can be added to the directory without allocating space. + * First checks that the caller couldn't reserve enough space (resblks = 0). */ int xfs_dir_canenter( xfs_trans_t *tp, xfs_inode_t *dp, - char *name, /* name of entry to add */ - int namelen) + struct xfs_name *name, /* name of entry to add */ + uint resblks) { - xfs_da_args_t args; + struct xfs_da_args *args; int rval; int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); - - args.name = name; - args.namelen = namelen; - args.hashval = xfs_da_hashname(name, namelen); - args.inumber = 0; - args.dp = dp; - args.firstblock = NULL; - args.flist = NULL; - args.total = 0; - args.whichfork = XFS_DATA_FORK; - args.trans = tp; - args.justcheck = args.addname = args.oknoent = 1; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) - rval = xfs_dir2_sf_addname(&args); - else if ((rval = xfs_dir2_isblock(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_block_addname(&args); - else if ((rval = xfs_dir2_isleaf(tp, dp, &v))) - return rval; - else if (v) - rval = xfs_dir2_leaf_addname(&args); + if (resblks) + return 0; + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + + args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); + if (!args) + return ENOMEM; + + args->geo = dp->i_mount->m_dir_geo; + args->name = name->name; + args->namelen = name->len; + args->filetype = name->type; + args->hashval = dp->i_mount->m_dirnameops->hashname(name); + args->dp = dp; + args->whichfork = XFS_DATA_FORK; + args->trans = tp; + args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | + XFS_DA_OP_OKNOENT; + + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + rval = xfs_dir2_sf_addname(args); + goto out_free; + } + + rval = xfs_dir2_isblock(args, &v); + if (rval) + goto out_free; + if (v) { + rval = xfs_dir2_block_addname(args); + goto out_free; + } + + rval = xfs_dir2_isleaf(args, &v); + if (rval) + goto out_free; + if (v) + rval = xfs_dir2_leaf_addname(args); else - rval = xfs_dir2_node_addname(&args); + rval = xfs_dir2_node_addname(args); +out_free: + kmem_free(args); return rval; } @@ -436,125 +599,36 @@ xfs_dir_canenter( /* * Add a block to the directory. - * This routine is for data and free blocks, not leaf/node blocks - * which are handled by xfs_da_grow_inode. + * + * This routine is for data and free blocks, not leaf/node blocks which are + * handled by xfs_da_grow_inode. */ int xfs_dir2_grow_inode( - xfs_da_args_t *args, - int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ - xfs_dir2_db_t *dbp) /* out: block number added */ + struct xfs_da_args *args, + int space, /* v2 dir's space XFS_DIR2_xxx_SPACE */ + xfs_dir2_db_t *dbp) /* out: block number added */ { - xfs_fileoff_t bno; /* directory offset of new block */ - int count; /* count of filesystem blocks */ - xfs_inode_t *dp; /* incore directory inode */ - int error; - int got; /* blocks actually mapped */ - int i; - xfs_bmbt_irec_t map; /* single structure for bmap */ - int mapi; /* mapping index */ - xfs_bmbt_irec_t *mapp; /* bmap mapping structure(s) */ - xfs_mount_t *mp; - int nmap; /* number of bmap entries */ - xfs_trans_t *tp; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + xfs_fileoff_t bno; /* directory offset of new block */ + int count; /* count of filesystem blocks */ + int error; + + trace_xfs_dir2_grow_inode(args, space); - xfs_dir2_trace_args_s("grow_inode", args, space); - dp = args->dp; - tp = args->trans; - mp = dp->i_mount; /* * Set lowest possible block in the space requested. */ bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE); - count = mp->m_dirblkfsbs; - /* - * Find the first hole for our block. - */ - if ((error = xfs_bmap_first_unused(tp, dp, count, &bno, XFS_DATA_FORK))) - return error; - nmap = 1; - ASSERT(args->firstblock != NULL); - /* - * Try mapping the new block contiguously (one extent). - */ - if ((error = xfs_bmapi(tp, dp, bno, count, - XFS_BMAPI_WRITE|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, - args->firstblock, args->total, &map, &nmap, - args->flist, NULL))) + count = args->geo->fsbcount; + + error = xfs_da_grow_inode_int(args, &bno, count); + if (error) return error; - ASSERT(nmap <= 1); - if (nmap == 1) { - mapp = ↦ - mapi = 1; - } - /* - * Didn't work and this is a multiple-fsb directory block. - * Try again with contiguous flag turned on. - */ - else if (nmap == 0 && count > 1) { - xfs_fileoff_t b; /* current file offset */ - /* - * Space for maximum number of mappings. - */ - mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP); - /* - * Iterate until we get to the end of our block. - */ - for (b = bno, mapi = 0; b < bno + count; ) { - int c; /* current fsb count */ - - /* - * Can't map more than MAX_NMAP at once. - */ - nmap = MIN(XFS_BMAP_MAX_NMAP, count); - c = (int)(bno + count - b); - if ((error = xfs_bmapi(tp, dp, b, c, - XFS_BMAPI_WRITE|XFS_BMAPI_METADATA, - args->firstblock, args->total, - &mapp[mapi], &nmap, args->flist, - NULL))) { - kmem_free(mapp, sizeof(*mapp) * count); - return error; - } - if (nmap < 1) - break; - /* - * Add this bunch into our table, go to the next offset. - */ - mapi += nmap; - b = mapp[mapi - 1].br_startoff + - mapp[mapi - 1].br_blockcount; - } - } - /* - * Didn't work. - */ - else { - mapi = 0; - mapp = NULL; - } - /* - * See how many fsb's we got. - */ - for (i = 0, got = 0; i < mapi; i++) - got += mapp[i].br_blockcount; - /* - * Didn't get enough fsb's, or the first/last block's are wrong. - */ - if (got != count || mapp[0].br_startoff != bno || - mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != - bno + count) { - if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); - return XFS_ERROR(ENOSPC); - } - /* - * Done with the temporary mapping table. - */ - if (mapp != &map) - kmem_free(mapp, sizeof(*mapp) * count); - *dbp = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)bno); + *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno); + /* * Update file's size if this is the data space and it grew. */ @@ -564,7 +638,7 @@ xfs_dir2_grow_inode( size = XFS_FSB_TO_B(mp, bno + count); if (size > dp->i_d.di_size) { dp->i_d.di_size = size; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE); } } return 0; @@ -575,19 +649,16 @@ xfs_dir2_grow_inode( */ int xfs_dir2_isblock( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is block, 0 is not block */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize; - ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize); + rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; + ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); *vp = rval; return 0; } @@ -597,93 +668,19 @@ xfs_dir2_isblock( */ int xfs_dir2_isleaf( - xfs_trans_t *tp, - xfs_inode_t *dp, - int *vp) /* out: 1 is leaf, 0 is not leaf */ + struct xfs_da_args *args, + int *vp) /* out: 1 is block, 0 is not block */ { - xfs_fileoff_t last; /* last file offset */ - xfs_mount_t *mp; - int rval; + xfs_fileoff_t last; /* last file offset */ + int rval; - mp = dp->i_mount; - if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK))) + if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; - *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog); + *vp = last == args->geo->leafblk + args->geo->fsbcount; return 0; } /* - * Getdents put routine for 64-bit ABI, direct form. - */ -static int -xfs_dir2_put_dirent64_direct( - xfs_dir2_put_args_t *pa) -{ - xfs_dirent_t *idbp; /* dirent pointer */ - iovec_t *iovp; /* io vector */ - int namelen; /* entry name length */ - int reclen; /* entry total length */ - uio_t *uio; /* I/O control */ - - namelen = pa->namelen; - reclen = DIRENTSIZE(namelen); - uio = pa->uio; - /* - * Won't fit in the remaining space. - */ - if (reclen > uio->uio_resid) { - pa->done = 0; - return 0; - } - iovp = uio->uio_iov; - idbp = (xfs_dirent_t *)iovp->iov_base; - iovp->iov_base = (char *)idbp + reclen; - iovp->iov_len -= reclen; - uio->uio_resid -= reclen; - idbp->d_reclen = reclen; - idbp->d_ino = pa->ino; - idbp->d_off = pa->cook; - idbp->d_name[namelen] = '\0'; - pa->done = 1; - memcpy(idbp->d_name, pa->name, namelen); - return 0; -} - -/* - * Getdents put routine for 64-bit ABI, uio form. - */ -static int -xfs_dir2_put_dirent64_uio( - xfs_dir2_put_args_t *pa) -{ - xfs_dirent_t *idbp; /* dirent pointer */ - int namelen; /* entry name length */ - int reclen; /* entry total length */ - int rval; /* return value */ - uio_t *uio; /* I/O control */ - - namelen = pa->namelen; - reclen = DIRENTSIZE(namelen); - uio = pa->uio; - /* - * Won't fit in the remaining space. - */ - if (reclen > uio->uio_resid) { - pa->done = 0; - return 0; - } - idbp = pa->dbp; - idbp->d_reclen = reclen; - idbp->d_ino = pa->ino; - idbp->d_off = pa->cook; - idbp->d_name[namelen] = '\0'; - memcpy(idbp->d_name, pa->name, namelen); - rval = uio_read((caddr_t)idbp, reclen, uio); - pa->done = (rval == 0); - return rval; -} - -/* * Remove the given block from the directory. * This routine is used for data and free blocks, leaf/node are done * by xfs_da_shrink_inode. @@ -692,7 +689,7 @@ int xfs_dir2_shrink_inode( xfs_da_args_t *args, xfs_dir2_db_t db, - xfs_dabuf_t *bp) + struct xfs_buf *bp) { xfs_fileoff_t bno; /* directory file offset */ xfs_dablk_t da; /* directory file offset */ @@ -702,17 +699,18 @@ xfs_dir2_shrink_inode( xfs_mount_t *mp; xfs_trans_t *tp; - xfs_dir2_trace_args_db("shrink_inode", args, db, bp); + trace_xfs_dir2_shrink_inode(args, db); + dp = args->dp; mp = dp->i_mount; tp = args->trans; - da = XFS_DIR2_DB_TO_DA(mp, db); + da = xfs_dir2_db_to_da(args->geo, db); /* * Unmap the fsblock(s). */ - if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs, + if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount, XFS_BMAPI_METADATA, 0, args->firstblock, args->flist, - NULL, &done))) { + &done))) { /* * ENOSPC actually can happen if we're in a removename with * no space reservation, and the resulting block removal @@ -733,16 +731,16 @@ xfs_dir2_shrink_inode( /* * Invalidate the buffer from the transaction. */ - xfs_da_binval(tp, bp); + xfs_trans_binval(tp, bp); /* * If it's not a data block, we're done. */ - if (db >= XFS_DIR2_LEAF_FIRSTDB(mp)) + if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)) return 0; /* * If the block isn't the last one in the directory, we're done. */ - if (dp->i_d.di_size > XFS_DIR2_DB_OFF_TO_BYTE(mp, db + 1, 0)) + if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0)) return 0; bno = da; if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) { @@ -751,7 +749,7 @@ xfs_dir2_shrink_inode( */ return error; } - if (db == mp->m_dirdatablk) + if (db == args->geo->datablk) ASSERT(bno == 0); else ASSERT(bno > 0); diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h index 86560b6f794..c8e86b0b5e9 100644 --- a/fs/xfs/xfs_dir2.h +++ b/fs/xfs/xfs_dir2.h @@ -16,102 +16,165 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_DIR2_H__ -#define __XFS_DIR2_H__ +#define __XFS_DIR2_H__ -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_dir2_put_args; struct xfs_bmap_free; +struct xfs_da_args; struct xfs_inode; struct xfs_mount; struct xfs_trans; +struct xfs_dir2_sf_hdr; +struct xfs_dir2_sf_entry; +struct xfs_dir2_data_hdr; +struct xfs_dir2_data_entry; +struct xfs_dir2_data_unused; -/* - * Directory version 2. - * There are 4 possible formats: - * shortform - * single block - data with embedded leaf at the end - * multiple data blocks, single leaf+freeindex block - * data blocks, node&leaf blocks (btree), freeindex blocks - * - * The shortform format is in xfs_dir2_sf.h. - * The single block format is in xfs_dir2_block.h. - * The data block format is in xfs_dir2_data.h. - * The leaf and freeindex block formats are in xfs_dir2_leaf.h. - * Node blocks are the same as the other version, in xfs_da_btree.h. - */ +extern struct xfs_name xfs_name_dotdot; /* - * Byte offset in data block and shortform entry. + * directory operations vector for encode/decode routines */ -typedef __uint16_t xfs_dir2_data_off_t; -#define NULLDATAOFF 0xffffU -typedef uint xfs_dir2_data_aoff_t; /* argument form */ +struct xfs_dir_ops { + int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len); + struct xfs_dir2_sf_entry * + (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep, + __uint8_t ftype); + xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep); + void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr, + struct xfs_dir2_sf_entry *sfep, + xfs_ino_t ino); + xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr); + void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr, + xfs_ino_t ino); -/* - * Directory block number (logical dirblk in file) - */ -typedef __uint32_t xfs_dir2_db_t; + int (*data_entsize)(int len); + __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep); + void (*data_put_ftype)(struct xfs_dir2_data_entry *dep, + __uint8_t ftype); + __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep); + struct xfs_dir2_data_free * + (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr); -/* - * Byte offset in a directory. - */ -typedef xfs_off_t xfs_dir2_off_t; + xfs_dir2_data_aoff_t data_dot_offset; + xfs_dir2_data_aoff_t data_dotdot_offset; + xfs_dir2_data_aoff_t data_first_offset; + size_t data_entry_offset; -/* - * For getdents, argument struct for put routines. - */ -typedef int (*xfs_dir2_put_t)(struct xfs_dir2_put_args *pa); -typedef struct xfs_dir2_put_args { - xfs_off_t cook; /* cookie of (next) entry */ - xfs_intino_t ino; /* inode number */ - xfs_dirent_t *dbp; /* buffer pointer */ - char *name; /* directory entry name */ - int namelen; /* length of name */ - int done; /* output: set if value was stored */ - xfs_dir2_put_t put; /* put function ptr (i/o) */ - struct uio *uio; /* uio control structure */ -} xfs_dir2_put_args_t; + struct xfs_dir2_data_entry * + (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_entry * + (*data_entry_p)(struct xfs_dir2_data_hdr *hdr); + struct xfs_dir2_data_unused * + (*data_unused_p)(struct xfs_dir2_data_hdr *hdr); + + int leaf_hdr_size; + void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to, + struct xfs_dir3_icleaf_hdr *from); + void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to, + struct xfs_dir2_leaf *from); + int (*leaf_max_ents)(struct xfs_da_geometry *geo); + struct xfs_dir2_leaf_entry * + (*leaf_ents_p)(struct xfs_dir2_leaf *lp); + + int node_hdr_size; + void (*node_hdr_to_disk)(struct xfs_da_intnode *to, + struct xfs_da3_icnode_hdr *from); + void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to, + struct xfs_da_intnode *from); + struct xfs_da_node_entry * + (*node_tree_p)(struct xfs_da_intnode *dap); + + int free_hdr_size; + void (*free_hdr_to_disk)(struct xfs_dir2_free *to, + struct xfs_dir3_icfree_hdr *from); + void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to, + struct xfs_dir2_free *from); + int (*free_max_bests)(struct xfs_da_geometry *geo); + __be16 * (*free_bests_p)(struct xfs_dir2_free *free); + xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); + int (*db_to_fdindex)(struct xfs_da_geometry *geo, + xfs_dir2_db_t db); +}; + +extern const struct xfs_dir_ops * + xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); +extern const struct xfs_dir_ops * + xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp); /* * Generic directory interface routines */ extern void xfs_dir_startup(void); -extern void xfs_dir_mount(struct xfs_mount *mp); +extern int xfs_da_mount(struct xfs_mount *mp); +extern void xfs_da_unmount(struct xfs_mount *mp); + extern int xfs_dir_isempty(struct xfs_inode *dp); extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, struct xfs_inode *pdp); extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t inum, + struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t *inum); + struct xfs_name *name, xfs_ino_t *inum, + struct xfs_name *ci_name); extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t ino, + struct xfs_name *name, xfs_ino_t ino, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); -extern int xfs_dir_getdents(struct xfs_trans *tp, struct xfs_inode *dp, - uio_t *uio, int *eofp); extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen, xfs_ino_t inum, + struct xfs_name *name, xfs_ino_t inum, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - char *name, int namelen); -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + struct xfs_name *name, uint resblks); /* - * Utility routines for v2 directories. + * Direct call from the bmap code, bypassing the generic directory layer. */ -extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, - xfs_dir2_db_t *dbp); -extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, - int *vp); -extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, - int *vp); +extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); + +/* + * Interface routines used by userspace utilities + */ +extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r); +extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, - struct xfs_dabuf *bp); + struct xfs_buf *bp); + +extern void xfs_dir2_data_freescan(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, int *loghead); +extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_entry *dep); +extern void xfs_dir2_data_log_header(struct xfs_da_args *args, + struct xfs_buf *bp); +extern void xfs_dir2_data_log_unused(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup); +extern void xfs_dir2_data_make_free(struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, + xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); +extern void xfs_dir2_data_use_free(struct xfs_da_args *args, + struct xfs_buf *bp, struct xfs_dir2_data_unused *dup, + xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len, + int *needlogp, int *needscanp); + +extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( + struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, + struct xfs_dir2_data_unused *dup); + +extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_free_buf_ops; +extern const struct xfs_buf_ops xfs_dir3_data_buf_ops; #endif /* __XFS_DIR2_H__ */ diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 9d7438bba30..c7cd3154026 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,34 +18,33 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_trace.h" +#include "xfs_bmap.h" +#include "xfs_buf_item.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" /* * Local function prototypes. */ -static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, xfs_dabuf_t *bp, int first, - int last); -static void xfs_dir2_block_log_tail(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **bpp, +static void xfs_dir2_block_log_leaf(xfs_trans_t *tp, struct xfs_buf *bp, + int first, int last); +static void xfs_dir2_block_log_tail(xfs_trans_t *tp, struct xfs_buf *bp); +static int xfs_dir2_block_lookup_int(xfs_da_args_t *args, struct xfs_buf **bpp, int *entno); static int xfs_dir2_block_sort(const void *a, const void *b); @@ -56,8 +56,275 @@ static xfs_dahash_t xfs_dir_hash_dot, xfs_dir_hash_dotdot; void xfs_dir_startup(void) { - xfs_dir_hash_dot = xfs_da_hashname(".", 1); - xfs_dir_hash_dotdot = xfs_da_hashname("..", 2); + xfs_dir_hash_dot = xfs_da_hashname((unsigned char *)".", 1); + xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); +} + +static bool +xfs_dir3_block_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + return false; + } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; +} + +static void +xfs_dir3_block_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_block_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_block_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_block_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .verify_read = xfs_dir3_block_read_verify, + .verify_write = xfs_dir3_block_write_verify, +}; + +int +xfs_dir3_block_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dp->i_mount; + int err; + + err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp, + XFS_DATA_FORK, &xfs_dir3_block_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF); + return err; +} + +static void +xfs_dir3_block_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *dp) +{ + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + bp->b_ops = &xfs_dir3_block_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + return; + + } + hdr3->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); +} + +static void +xfs_dir2_block_need_space( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + __be16 **tagpp, + struct xfs_dir2_data_unused **dupp, + struct xfs_dir2_data_unused **enddupp, + int *compact, + int len) +{ + struct xfs_dir2_data_free *bf; + __be16 *tagp = NULL; + struct xfs_dir2_data_unused *dup = NULL; + struct xfs_dir2_data_unused *enddup = NULL; + + *compact = 0; + bf = dp->d_ops->data_bestfree_p(hdr); + + /* + * If there are stale entries we'll use one for the leaf. + */ + if (btp->stale) { + if (be16_to_cpu(bf[0].length) >= len) { + /* + * The biggest entry enough to avoid compaction. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + goto out; + } + + /* + * Will need to compact to make this work. + * Tag just before the first leaf entry. + */ + *compact = 1; + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then the data will go where the + * leaf data starts now, if it works at all. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * + (uint)sizeof(*blp) < len) + dup = NULL; + } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) + dup = NULL; + else + dup = (xfs_dir2_data_unused_t *)blp; + goto out; + } + + /* + * no stale entries, so just use free space. + * Tag just before the first leaf entry. + */ + tagp = (__be16 *)blp - 1; + + /* Data object just before the first leaf entry. */ + enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); + + /* + * If it's not free then can't do this add without cleaning up: + * the space before the first leaf entry needs to be free so it + * can be expanded to hold the pointer to the new entry. + */ + if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + /* + * Check out the biggest freespace and see if it's the same one. + */ + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[0].offset)); + if (dup != enddup) { + /* + * Not the same free entry, just check its length. + */ + if (be16_to_cpu(dup->length) < len) + dup = NULL; + goto out; + } + + /* + * It is the biggest freespace, can it hold the leaf too? + */ + if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { + /* + * Yes, use the second-largest entry instead if it works. + */ + if (be16_to_cpu(bf[1].length) >= len) + dup = (xfs_dir2_data_unused_t *) + ((char *)hdr + be16_to_cpu(bf[1].offset)); + else + dup = NULL; + } + } +out: + *tagpp = tagp; + *dupp = dup; + *enddupp = enddup; +} + +/* + * compact the leaf entries. + * Leave the highest-numbered stale entry stale. + * XXX should be the one closest to mid but mid is not yet computed. + */ +static void +xfs_dir2_block_compact( + struct xfs_da_args *args, + struct xfs_buf *bp, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_block_tail *btp, + struct xfs_dir2_leaf_entry *blp, + int *needlog, + int *lfloghigh, + int *lfloglow) +{ + int fromidx; /* source leaf index */ + int toidx; /* target leaf index */ + int needscan = 0; + int highstale; /* high stale index */ + + fromidx = toidx = be32_to_cpu(btp->count) - 1; + highstale = *lfloghigh = -1; + for (; fromidx >= 0; fromidx--) { + if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { + if (highstale == -1) + highstale = toidx; + else { + if (*lfloghigh == -1) + *lfloghigh = toidx; + continue; + } + } + if (fromidx < toidx) + blp[toidx] = blp[fromidx]; + toidx--; + } + *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); + *lfloghigh -= be32_to_cpu(btp->stale) - 1; + be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); + xfs_dir2_data_make_free(args, bp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), + needlog, &needscan); + btp->stale = cpu_to_be32(1); + /* + * If we now need to rebuild the bestfree map, do so. + * This needs to happen before the next call to use_free. + */ + if (needscan) + xfs_dir2_data_freescan(args->dp, hdr, needlog); } /* @@ -67,10 +334,9 @@ int /* error */ xfs_dir2_block_addname( xfs_da_args_t *args) /* directory op arguments */ { - xfs_dir2_data_free_t *bf; /* bestfree table in block */ - xfs_dir2_block_t *block; /* directory block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* buffer for block */ + struct xfs_buf *bp; /* buffer for block */ xfs_dir2_block_tail_t *btp; /* block tail */ int compact; /* need to compact leaf ents */ xfs_dir2_data_entry_t *dep; /* block data entry */ @@ -93,208 +359,79 @@ xfs_dir2_block_addname( __be16 *tagp; /* pointer to tag value */ xfs_trans_t *tp; /* transaction structure */ - xfs_dir2_trace_args("block_addname", args); + trace_xfs_dir2_block_addname(args); + dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the (one and only) directory block into dabuf bp. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + + /* Read the (one and only) directory block into bp. */ + error = xfs_dir3_block_read(tp, dp, &bp); + if (error) return error; - } - ASSERT(bp != NULL); - block = bp->data; - /* - * Check the magic number, corrupted if wrong. - */ - if (unlikely(be32_to_cpu(block->hdr.magic) != XFS_DIR2_BLOCK_MAGIC)) { - XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", - XFS_ERRLEVEL_LOW, mp, block); - xfs_da_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } - len = XFS_DIR2_DATA_ENTSIZE(args->namelen); + + len = dp->d_ops->data_entsize(args->namelen); + /* * Set up pointers to parts of the block. */ - bf = block->hdr.bestfree; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); - /* - * No stale entries? Need space for entry and new leaf. - */ - if (!btp->stale) { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - enddup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); - /* - * If it's not free then can't do this add without cleaning up: - * the space before the first leaf entry needs to be free so it - * can be expanded to hold the pointer to the new entry. - */ - if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) - dup = enddup = NULL; - /* - * Check out the biggest freespace and see if it's the same one. - */ - else { - dup = (xfs_dir2_data_unused_t *) - ((char *)block + be16_to_cpu(bf[0].offset)); - if (dup == enddup) { - /* - * It is the biggest freespace, is it too small - * to hold the new leaf too? - */ - if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { - /* - * Yes, we use the second-largest - * entry instead if it works. - */ - if (be16_to_cpu(bf[1].length) >= len) - dup = (xfs_dir2_data_unused_t *) - ((char *)block + - be16_to_cpu(bf[1].offset)); - else - dup = NULL; - } - } else { - /* - * Not the same free entry, - * just check its length. - */ - if (be16_to_cpu(dup->length) < len) { - dup = NULL; - } - } - } - compact = 0; - } + hdr = bp->b_addr; + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + /* - * If there are stale entries we'll use one for the leaf. - * Is the biggest entry enough to avoid compaction? + * Find out if we can reuse stale entries or whether we need extra + * space for entry and new leaf. */ - else if (be16_to_cpu(bf[0].length) >= len) { - dup = (xfs_dir2_data_unused_t *) - ((char *)block + be16_to_cpu(bf[0].offset)); - compact = 0; - } + xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup, + &enddup, &compact, len); + /* - * Will need to compact to make this work. + * Done everything we need for a space check now. */ - else { - /* - * Tag just before the first leaf entry. - */ - tagp = (__be16 *)blp - 1; - /* - * Data object just before the first leaf entry. - */ - dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); - /* - * If it's not free then the data will go where the - * leaf data starts now, if it works at all. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * - (uint)sizeof(*blp) < len) - dup = NULL; - } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) - dup = NULL; - else - dup = (xfs_dir2_data_unused_t *)blp; - compact = 1; + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_trans_brelse(tp, bp); + if (!dup) + return XFS_ERROR(ENOSPC); + return 0; } - /* - * If this isn't a real add, we're done with the buffer. - */ - if (args->justcheck) - xfs_da_brelse(tp, bp); + /* * If we don't have space for the new entry & leaf ... */ if (!dup) { - /* - * Not trying to actually do anything, or don't have - * a space reservation: return no-space. - */ - if (args->justcheck || args->total == 0) + /* Don't have a space reservation: return no-space. */ + if (args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to the next larger format. * Then add the new entry in that format. */ error = xfs_dir2_block_to_leaf(args, bp); - xfs_da_buf_done(bp); if (error) return error; return xfs_dir2_leaf_addname(args); } - /* - * Just checking, and it would work, so say so. - */ - if (args->justcheck) - return 0; + needlog = needscan = 0; + /* * If need to compact the leaf entries, do it now. - * Leave the highest-numbered stale entry stale. - * XXX should be the one closest to mid but mid is not yet computed. */ if (compact) { - int fromidx; /* source leaf index */ - int toidx; /* target leaf index */ - - for (fromidx = toidx = be32_to_cpu(btp->count) - 1, - highstale = lfloghigh = -1; - fromidx >= 0; - fromidx--) { - if (be32_to_cpu(blp[fromidx].address) == XFS_DIR2_NULL_DATAPTR) { - if (highstale == -1) - highstale = toidx; - else { - if (lfloghigh == -1) - lfloghigh = toidx; - continue; - } - } - if (fromidx < toidx) - blp[toidx] = blp[fromidx]; - toidx--; - } - lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); - lfloghigh -= be32_to_cpu(btp->stale) - 1; - be32_add(&btp->count, -(be32_to_cpu(btp->stale) - 1)); - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), - (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), - &needlog, &needscan); - blp += be32_to_cpu(btp->stale) - 1; - btp->stale = cpu_to_be32(1); + xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog, + &lfloghigh, &lfloglow); + /* recalculate blp post-compaction */ + blp = xfs_dir2_block_leaf_p(btp); + } else if (btp->stale) { /* - * If we now need to rebuild the bestfree map, do so. - * This needs to happen before the next call to use_free. + * Set leaf logging boundaries to impossible state. + * For the no-stale case they're set explicitly. */ - if (needscan) { - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, - &needlog, NULL); - needscan = 0; - } - } - /* - * Set leaf logging boundaries to impossible state. - * For the no-stale case they're set explicitly. - */ - else if (btp->stale) { lfloglow = be32_to_cpu(btp->count); lfloghigh = -1; } + /* * Find the slot that's first lower than our hash value, -1 if none. */ @@ -317,23 +454,22 @@ xfs_dir2_block_addname( /* * Mark the space needed for the new leaf entry, now in use. */ - xfs_dir2_data_use_free(tp, bp, enddup, + xfs_dir2_data_use_free(args, bp, enddup, (xfs_dir2_data_aoff_t) - ((char *)enddup - (char *)block + be16_to_cpu(enddup->length) - + ((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) - sizeof(*blp)), (xfs_dir2_data_aoff_t)sizeof(*blp), &needlog, &needscan); /* * Update the tail (entry count). */ - be32_add(&btp->count, 1); + be32_add_cpu(&btp->count, 1); /* * If we now need to rebuild the bestfree map, do so. * This needs to happen before the next call to use_free. */ if (needscan) { - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, - &needlog, NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); needscan = 0; } /* @@ -354,12 +490,14 @@ xfs_dir2_block_addname( else { for (lowstale = mid; lowstale >= 0 && - be32_to_cpu(blp[lowstale].address) != XFS_DIR2_NULL_DATAPTR; + blp[lowstale].address != + cpu_to_be32(XFS_DIR2_NULL_DATAPTR); lowstale--) continue; for (highstale = mid + 1; highstale < be32_to_cpu(btp->count) && - be32_to_cpu(blp[highstale].address) != XFS_DIR2_NULL_DATAPTR && + blp[highstale].address != + cpu_to_be32(XFS_DIR2_NULL_DATAPTR) && (lowstale < 0 || mid - lowstale > highstale - mid); highstale++) continue; @@ -387,7 +525,7 @@ xfs_dir2_block_addname( lfloglow = MIN(mid, lfloglow); lfloghigh = MAX(highstale, lfloghigh); } - be32_add(&btp->stale, -1); + be32_add_cpu(&btp->stale, -1); } /* * Point to the new data entry. @@ -397,14 +535,14 @@ xfs_dir2_block_addname( * Fill in the leaf entry. */ blp[mid].hashval = cpu_to_be32(args->hashval); - blp[mid].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, - (char *)dep - (char *)block)); + blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh); /* * Mark space for the data entry used. */ - xfs_dir2_data_use_free(tp, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), (xfs_dir2_data_aoff_t)len, &needlog, &needscan); /* * Create the new data entry. @@ -412,145 +550,19 @@ xfs_dir2_block_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, args->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Clean up the bestfree array and log the header, tail, and entry. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, bp); + xfs_dir2_data_log_header(args, bp); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_log_entry(tp, bp, dep); - xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); - return 0; -} - -/* - * Readdir for block directories. - */ -int /* error */ -xfs_dir2_block_getdents( - xfs_trans_t *tp, /* transaction (NULL) */ - xfs_inode_t *dp, /* incore inode */ - uio_t *uio, /* caller's buffer control */ - int *eofp, /* eof reached? (out) */ - xfs_dirent_t *dbp, /* caller's buffer */ - xfs_dir2_put_t put) /* abi's formatting function */ -{ - xfs_dir2_block_t *block; /* directory block structure */ - xfs_dabuf_t *bp; /* buffer for block */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_dir2_data_entry_t *dep; /* block data entry */ - xfs_dir2_data_unused_t *dup; /* block unused entry */ - char *endptr; /* end of the data entries */ - int error; /* error return value */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_put_args_t p; /* arg package for put rtn */ - char *ptr; /* current data entry */ - int wantoff; /* starting block offset */ - - mp = dp->i_mount; - /* - * If the block number in the offset is out of range, we're done. - */ - if (XFS_DIR2_DATAPTR_TO_DB(mp, uio->uio_offset) > mp->m_dirdatablk) { - *eofp = 1; - return 0; - } - /* - * Can't read the block, give up, else get dabuf in bp. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { - return error; - } - ASSERT(bp != NULL); - /* - * Extract the byte offset we start at from the seek pointer. - * We'll skip entries before this. - */ - wantoff = XFS_DIR2_DATAPTR_TO_OFF(mp, uio->uio_offset); - block = bp->data; - xfs_dir2_data_check(dp, bp); - /* - * Set up values for the loop. - */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - ptr = (char *)block->u; - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); - p.dbp = dbp; - p.put = put; - p.uio = uio; - /* - * Loop over the data portion of the block. - * Each object is a real entry (dep) or an unused one (dup). - */ - while (ptr < endptr) { - dup = (xfs_dir2_data_unused_t *)ptr; - /* - * Unused, skip it. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ptr += be16_to_cpu(dup->length); - continue; - } - - dep = (xfs_dir2_data_entry_t *)ptr; - - /* - * Bump pointer for the next iteration. - */ - ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); - /* - * The entry is before the desired starting point, skip it. - */ - if ((char *)dep - (char *)block < wantoff) - continue; - /* - * Set up argument structure for put routine. - */ - p.namelen = dep->namelen; - - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - ptr - (char *)block); - p.ino = be64_to_cpu(dep->inumber); -#if XFS_BIG_INUMS - p.ino += mp->m_inoadd; -#endif - p.name = (char *)dep->name; - - /* - * Put the entry in the caller's buffer. - */ - error = p.put(&p); - - /* - * If it didn't fit, set the final offset to here & return. - */ - if (!p.done) { - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - (char *)dep - (char *)block); - xfs_da_brelse(tp, bp); - return error; - } - } - - /* - * Reached the end of the block. - * Set the offset to a non-existent block 1 and return. - */ - *eofp = 1; - - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); - - xfs_da_brelse(tp, bp); - + xfs_dir2_data_log_entry(args, bp, dep); + xfs_dir3_data_check(dp, bp); return 0; } @@ -560,21 +572,18 @@ xfs_dir2_block_getdents( static void xfs_dir2_block_log_leaf( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, /* block buffer */ int first, /* index of first logged leaf */ int last) /* index of last logged leaf */ { - xfs_dir2_block_t *block; /* directory block structure */ - xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_leaf_entry_t *blp; + xfs_dir2_block_tail_t *btp; - mp = tp->t_mountp; - block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); - xfs_da_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)block), - (uint)((char *)&blp[last + 1] - (char *)block - 1)); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr), + (uint)((char *)&blp[last + 1] - (char *)hdr - 1)); } /* @@ -583,17 +592,14 @@ xfs_dir2_block_log_leaf( static void xfs_dir2_block_log_tail( xfs_trans_t *tp, /* transaction structure */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_buf *bp) /* block buffer */ { - xfs_dir2_block_t *block; /* directory block structure */ - xfs_dir2_block_tail_t *btp; /* block tail */ - xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + xfs_dir2_block_tail_t *btp; - mp = tp->t_mountp; - block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - xfs_da_log_buf(tp, bp, (uint)((char *)btp - (char *)block), - (uint)((char *)(btp + 1) - (char *)block - 1)); + btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr); + xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr), + (uint)((char *)(btp + 1) - (char *)hdr - 1)); } /* @@ -604,9 +610,9 @@ int /* error */ xfs_dir2_block_lookup( xfs_da_args_t *args) /* dir lookup arguments */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -614,7 +620,8 @@ xfs_dir2_block_lookup( int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_trace_args("block_lookup", args); + trace_xfs_dir2_block_lookup(args); + /* * Get the buffer, look up the entry. * If not found (ENOENT) then return, have no buffer. @@ -623,21 +630,24 @@ xfs_dir2_block_lookup( return error; dp = args->dp; mp = dp->i_mount; - block = bp->data; - xfs_dir2_data_check(dp, bp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); /* * Get the offset from the leaf entry, to point to the data. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* - * Fill in inode number, release the block. + * Fill in inode number, CI name if appropriate, release the block. */ args->inumber = be64_to_cpu(dep->inumber); - xfs_da_brelse(args->trans, bp); - return XFS_ERROR(EEXIST); + args->filetype = dp->d_ops->data_get_ftype(dep); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_trans_brelse(args->trans, bp); + return XFS_ERROR(error); } /* @@ -646,13 +656,13 @@ xfs_dir2_block_lookup( static int /* error */ xfs_dir2_block_lookup_int( xfs_da_args_t *args, /* dir lookup arguments */ - xfs_dabuf_t **bpp, /* returned block buffer */ + struct xfs_buf **bpp, /* returned block buffer */ int *entno) /* returned entry number */ { xfs_dir2_dataptr_t addr; /* data entry address */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -663,22 +673,20 @@ xfs_dir2_block_lookup_int( int mid; /* binary search current idx */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the buffer, return error if we can't get it. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { + + error = xfs_dir3_block_read(tp, dp, &bp); + if (error) return error; - } - ASSERT(bp != NULL); - block = bp->data; - xfs_dir2_data_check(dp, bp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); /* * Loop doing a binary search for our hash value. * Find our entry, ENOENT if it's not there. @@ -693,8 +701,8 @@ xfs_dir2_block_lookup_int( else high = mid - 1; if (low > high) { - ASSERT(args->oknoent); - xfs_da_brelse(tp, bp); + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } } @@ -715,23 +723,34 @@ xfs_dir2_block_lookup_int( * Get pointer to the entry from the leaf. */ dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr)); /* - * Compare, if it's right give back buffer & entry number. + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; *bpp = bp; *entno = mid; - return 0; + if (cmp == XFS_CMP_EXACT) + return 0; } - } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash); + } while (++mid < be32_to_cpu(btp->count) && + be32_to_cpu(blp[mid].hashval) == hash); + + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was found earlier, return success. + */ + if (args->cmpresult == XFS_CMP_CASE) + return 0; /* * No match, release the buffer and return ENOENT. */ - ASSERT(args->oknoent); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return XFS_ERROR(ENOENT); } @@ -743,9 +762,9 @@ int /* error */ xfs_dir2_block_removename( xfs_da_args_t *args) /* directory operation args */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf pointer */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -758,7 +777,8 @@ xfs_dir2_block_removename( int size; /* shortform size */ xfs_trans_t *tp; /* transaction pointer */ - xfs_dir2_trace_args("block_removename", args); + trace_xfs_dir2_block_removename(args); + /* * Look up the entry in the block. Gets the buffer and entry index. * It will always be there, the vnodeops level does a lookup first. @@ -769,25 +789,26 @@ xfs_dir2_block_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + hdr = bp->b_addr; + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry using the leaf entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); /* * Mark the data entry's space free. */ needlog = needscan = 0; - xfs_dir2_data_make_free(tp, bp, - (xfs_dir2_data_aoff_t)((char *)dep - (char *)block), - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_make_free(args, bp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Fix up the block tail. */ - be32_add(&btp->stale, 1); + be32_add_cpu(&btp->stale, 1); xfs_dir2_block_log_tail(tp, bp); /* * Remove the leaf entry by marking it stale. @@ -798,19 +819,17 @@ xfs_dir2_block_removename( * Fix up bestfree, log the header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_check(dp, bp); + xfs_dir2_data_log_header(args, bp); + xfs_dir3_data_check(dp, bp); /* * See if the size as a shortform is good enough. */ - if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > - XFS_IFORK_DSIZE(dp)) { - xfs_da_buf_done(bp); + size = xfs_dir2_block_sfsize(dp, hdr, &sfh); + if (size > XFS_IFORK_DSIZE(dp)) return 0; - } + /* * If it works, do the conversion. */ @@ -825,9 +844,9 @@ int /* error */ xfs_dir2_block_replace( xfs_da_args_t *args) /* directory operation args */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* block data entry */ xfs_inode_t *dp; /* incore inode */ @@ -835,7 +854,8 @@ xfs_dir2_block_replace( int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_trace_args("block_replace", args); + trace_xfs_dir2_block_replace(args); + /* * Lookup the entry in the directory. Get buffer and entry index. * This will always succeed since the caller has already done a lookup. @@ -845,22 +865,23 @@ xfs_dir2_block_replace( } dp = args->dp; mp = dp->i_mount; - block = bp->data; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + hdr = bp->b_addr; + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); /* * Point to the data entry we need to change. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(blp[ent].address))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(blp[ent].address))); ASSERT(be64_to_cpu(dep->inumber) != args->inumber); /* * Change the inode number to the new value. */ dep->inumber = cpu_to_be64(args->inumber); - xfs_dir2_data_log_entry(args->trans, bp, dep); - xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); + dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, bp, dep); + xfs_dir3_data_check(dp, bp); return 0; } @@ -887,11 +908,11 @@ xfs_dir2_block_sort( int /* error */ xfs_dir2_leaf_to_block( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ - xfs_dabuf_t *dbp) /* data buffer */ + struct xfs_buf *lbp, /* leaf buffer */ + struct xfs_buf *dbp) /* data buffer */ { __be16 *bestsp; /* leaf bests table */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_block_tail_t *btp; /* block tail */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused data entry */ @@ -908,87 +929,97 @@ xfs_dir2_leaf_to_block( __be16 *tagp; /* end of entry (tag) */ int to; /* block/leaf to index */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_to_block(args); - xfs_dir2_trace_args_bb("leaf_to_block", args, lbp, dbp); dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + leaf = lbp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC || + leafhdr.magic == XFS_DIR3_LEAF1_MAGIC); /* * If there are data blocks other than the first one, take this * opportunity to remove trailing empty data blocks that may have * been left behind during no-space-reservation operations. * These will show up in the leaf bests table. */ - while (dp->i_d.di_size > mp->m_dirblksize) { - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + while (dp->i_d.di_size > args->geo->blksize) { + int hdrsz; + + hdrsz = dp->d_ops->data_entry_offset; + bestsp = xfs_dir2_leaf_bests_p(ltp); if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) == - mp->m_dirblksize - (uint)sizeof(block->hdr)) { + args->geo->blksize - hdrsz) { if ((error = xfs_dir2_leaf_trim_data(args, lbp, (xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1)))) - goto out; - } else { - error = 0; - goto out; - } + return error; + } else + return 0; } /* * Read the data block if we don't already have it, give up if it fails. */ - if (dbp == NULL && - (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, - XFS_DATA_FORK))) { - goto out; + if (!dbp) { + error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp); + if (error) + return error; } - block = dbp->data; - ASSERT(be32_to_cpu(block->hdr.magic) == XFS_DIR2_DATA_MAGIC); + hdr = dbp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + /* * Size of the "leaf" area in the block. */ - size = (uint)sizeof(block->tail) + - (uint)sizeof(*lep) * (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + size = (uint)sizeof(xfs_dir2_block_tail_t) + + (uint)sizeof(*lep) * (leafhdr.count - leafhdr.stale); /* * Look at the last data entry. */ - tagp = (__be16 *)((char *)block + mp->m_dirblksize) - 1; - dup = (xfs_dir2_data_unused_t *)((char *)block + be16_to_cpu(*tagp)); + tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1; + dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); /* * If it's not free or is too short we can't do it. */ if (be16_to_cpu(dup->freetag) != XFS_DIR2_DATA_FREE_TAG || - be16_to_cpu(dup->length) < size) { - error = 0; - goto out; - } + be16_to_cpu(dup->length) < size) + return 0; + /* * Start converting it to block form. */ - block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + xfs_dir3_block_init(mp, tp, dbp, dp); + needlog = 1; needscan = 0; /* * Use up the space at the end of the block (blp/btp). */ - xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size, + xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size, &needlog, &needscan); /* * Initialize the block tail. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - btp->count = cpu_to_be32(be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale); btp->stale = 0; xfs_dir2_block_log_tail(tp, dbp); /* * Initialize the block leaf area. We compact out stale entries. */ - lep = XFS_DIR2_BLOCK_LEAF_P(btp); - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { - if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + lep = xfs_dir2_block_leaf_p(btp); + for (from = to = 0; from < leafhdr.count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; - lep[to++] = leaf->ents[from]; + lep[to++] = ents[from]; } ASSERT(to == be32_to_cpu(btp->count)); xfs_dir2_block_log_leaf(tp, dbp, 0, be32_to_cpu(btp->count) - 1); @@ -996,33 +1027,24 @@ xfs_dir2_leaf_to_block( * Scan the bestfree if we need it and log the data block header. */ if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * Pitch the old leaf block. */ - error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp); - lbp = NULL; - if (error) { - goto out; - } + error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp); + if (error) + return error; + /* * Now see if the resulting block can be shrunken to shortform. */ - if ((size = xfs_dir2_block_sfsize(dp, block, &sfh)) > - XFS_IFORK_DSIZE(dp)) { - error = 0; - goto out; - } + size = xfs_dir2_block_sfsize(dp, hdr, &sfh); + if (size > XFS_IFORK_DSIZE(dp)) + return 0; + return xfs_dir2_block_to_sf(args, dbp, size, &sfh); -out: - if (lbp) - xfs_da_buf_done(lbp); - if (dbp) - xfs_da_buf_done(dbp); - return error; } /* @@ -1033,12 +1055,10 @@ xfs_dir2_sf_to_block( xfs_da_args_t *args) /* operation arguments */ { xfs_dir2_db_t blkno; /* dir-relative block # (0) */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ - xfs_dabuf_t *bp; /* block buffer */ + struct xfs_buf *bp; /* block buffer */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ - char *buf; /* sf buffer */ - int buf_len; xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ int dummy; /* trash */ @@ -1052,15 +1072,20 @@ xfs_dir2_sf_to_block( int newoffset; /* offset from current entry */ int offset; /* target block offset */ xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */ + xfs_dir2_sf_hdr_t *sfp; /* shortform header */ __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_name name; + struct xfs_ifork *ifp; + + trace_xfs_dir2_sf_to_block(args); - xfs_dir2_trace_args("sf_to_block", args); dp = args->dp; tp = args->trans; mp = dp->i_mount; - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK); + ASSERT(ifp->if_flags & XFS_IFINLINE); /* * Bomb out if the shortform directory is way too short. */ @@ -1068,108 +1093,109 @@ xfs_dir2_sf_to_block( ASSERT(XFS_FORCED_SHUTDOWN(mp)); return XFS_ERROR(EIO); } - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + + oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; + + ASSERT(ifp->if_bytes == dp->i_d.di_size); + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count)); + ASSERT(dp->i_d.di_nextents == 0); + /* - * Copy the directory into the stack buffer. + * Copy the directory into a temporary buffer. * Then pitch the incore inode data so we can make extents. */ + sfp = kmem_alloc(ifp->if_bytes, KM_SLEEP); + memcpy(sfp, oldsfp, ifp->if_bytes); - buf_len = dp->i_df.if_bytes; - buf = kmem_alloc(dp->i_df.if_bytes, KM_SLEEP); - - memcpy(buf, sfp, dp->i_df.if_bytes); - xfs_idata_realloc(dp, -dp->i_df.if_bytes, XFS_DATA_FORK); + xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); dp->i_d.di_size = 0; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - /* - * Reset pointer - old sfp is gone. - */ - sfp = (xfs_dir2_sf_t *)buf; + /* * Add block 0 to the inode. */ error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno); if (error) { - kmem_free(buf, buf_len); + kmem_free(sfp); return error; } /* - * Initialize the data block. + * Initialize the data block, then convert it to block format. */ - error = xfs_dir2_data_init(args, blkno, &bp); + error = xfs_dir3_data_init(args, blkno, &bp); if (error) { - kmem_free(buf, buf_len); + kmem_free(sfp); return error; } - block = bp->data; - block->hdr.magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); + xfs_dir3_block_init(mp, tp, bp, dp); + hdr = bp->b_addr; + /* * Compute size of block "tail" area. */ i = (uint)sizeof(*btp) + - (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t); /* * The whole thing is initialized to free by the init routine. * Say we're using the leaf and tail area. */ - dup = (xfs_dir2_data_unused_t *)block->u; + dup = dp->d_ops->data_unused_p(hdr); needlog = needscan = 0; - xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog, - &needscan); + xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i, + i, &needlog, &needscan); ASSERT(needscan == 0); /* * Fill in the tail. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - btp->count = cpu_to_be32(sfp->hdr.count + 2); /* ., .. */ + btp = xfs_dir2_block_tail_p(args->geo, hdr); + btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */ btp->stale = 0; - blp = XFS_DIR2_BLOCK_LEAF_P(btp); - endoffset = (uint)((char *)blp - (char *)block); + blp = xfs_dir2_block_leaf_p(btp); + endoffset = (uint)((char *)blp - (char *)hdr); /* * Remove the freespace, we'll manage it. */ - xfs_dir2_data_use_free(tp, bp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)block), + xfs_dir2_data_use_free(args, bp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), be16_to_cpu(dup->length), &needlog, &needscan); /* * Create entry for . */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATA_DOT_OFFSET); + dep = dp->d_ops->data_dot_entry_p(hdr); dep->inumber = cpu_to_be64(dp->i_ino); dep->namelen = 1; dep->name[0] = '.'; - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); - xfs_dir2_data_log_entry(tp, bp, dep); + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, bp, dep); blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot); - blp[0].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, - (char *)dep - (char *)block)); + blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); /* * Create entry for .. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATA_DOTDOT_OFFSET); - dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + dep = dp->d_ops->data_dotdot_entry_p(hdr); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp)); dep->namelen = 2; dep->name[0] = dep->name[1] = '.'; - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); - xfs_dir2_data_log_entry(tp, bp, dep); + dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, bp, dep); blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot); - blp[1].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, - (char *)dep - (char *)block)); - offset = XFS_DIR2_DATA_FIRST_OFFSET; + blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); + offset = dp->d_ops->data_first_offset; /* * Loop over existing entries, stuff them in. */ - if ((i = 0) == sfp->hdr.count) + i = 0; + if (!sfp->count) sfep = NULL; else - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + sfep = xfs_dir2_sf_firstentry(sfp); /* * Need to preserve the existing offset values in the sf directory. * Insert holes (unused entries) where necessary. @@ -1181,46 +1207,48 @@ xfs_dir2_sf_to_block( if (sfep == NULL) newoffset = endoffset; else - newoffset = XFS_DIR2_SF_GET_OFFSET(sfep); + newoffset = xfs_dir2_sf_get_offset(sfep); /* * There should be a hole here, make one. */ if (offset < newoffset) { - dup = (xfs_dir2_data_unused_t *) - ((char *)block + offset); + dup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); dup->length = cpu_to_be16(newoffset - offset); - *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16( - ((char *)dup - (char *)block)); - xfs_dir2_data_log_unused(tp, bp, dup); - (void)xfs_dir2_data_freeinsert((xfs_dir2_data_t *)block, - dup, &dummy); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16( + ((char *)dup - (char *)hdr)); + xfs_dir2_data_log_unused(args, bp, dup); + xfs_dir2_data_freeinsert(hdr, + dp->d_ops->data_bestfree_p(hdr), + dup, &dummy); offset += be16_to_cpu(dup->length); continue; } /* * Copy a real entry. */ - dep = (xfs_dir2_data_entry_t *)((char *)block + newoffset); - dep->inumber = cpu_to_be64(XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep))); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset); + dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep)); dep->namelen = sfep->namelen; + dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep)); memcpy(dep->name, sfep->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); - *tagp = cpu_to_be16((char *)dep - (char *)block); - xfs_dir2_data_log_entry(tp, bp, dep); - blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname( - (char *)sfep->name, sfep->namelen)); - blp[2 + i].address = cpu_to_be32(XFS_DIR2_BYTE_TO_DATAPTR(mp, - (char *)dep - (char *)block)); - offset = (int)((char *)(tagp + 1) - (char *)block); - if (++i == sfp->hdr.count) + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, bp, dep); + name.name = sfep->name; + name.len = sfep->namelen; + blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops-> + hashname(&name)); + blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr( + (char *)dep - (char *)hdr)); + offset = (int)((char *)(tagp + 1) - (char *)hdr); + if (++i == sfp->count) sfep = NULL; else - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } /* Done with the temporary buffer */ - kmem_free(buf, buf_len); + kmem_free(sfp); /* * Sort the leaf entries by hash value. */ @@ -1232,7 +1260,6 @@ xfs_dir2_sf_to_block( ASSERT(needscan == 0); xfs_dir2_block_log_leaf(tp, bp, 0, be32_to_cpu(btp->count) - 1); xfs_dir2_block_log_tail(tp, bp); - xfs_dir2_data_check(dp, bp); - xfs_da_buf_done(bp); + xfs_dir3_data_check(dp, bp); return 0; } diff --git a/fs/xfs/xfs_dir2_block.h b/fs/xfs/xfs_dir2_block.h deleted file mode 100644 index 6722effd0b2..00000000000 --- a/fs/xfs/xfs_dir2_block.h +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_BLOCK_H__ -#define __XFS_DIR2_BLOCK_H__ - -/* - * xfs_dir2_block.h - * Directory version 2, single block format structures - */ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_dir2_data_hdr; -struct xfs_dir2_leaf_entry; -struct xfs_inode; -struct xfs_mount; -struct xfs_trans; - -/* - * The single block format is as follows: - * xfs_dir2_data_hdr_t structure - * xfs_dir2_data_entry_t and xfs_dir2_data_unused_t structures - * xfs_dir2_leaf_entry_t structures - * xfs_dir2_block_tail_t structure - */ - -#define XFS_DIR2_BLOCK_MAGIC 0x58443242 /* XD2B: for one block dirs */ - -typedef struct xfs_dir2_block_tail { - __be32 count; /* count of leaf entries */ - __be32 stale; /* count of stale lf entries */ -} xfs_dir2_block_tail_t; - -/* - * Generic single-block structure, for xfs_db. - */ -typedef struct xfs_dir2_block { - xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_BLOCK_MAGIC */ - xfs_dir2_data_union_t u[1]; - xfs_dir2_leaf_entry_t leaf[1]; - xfs_dir2_block_tail_t tail; -} xfs_dir2_block_t; - -/* - * Pointer to the leaf header embedded in a data block (1-block format) - */ -#define XFS_DIR2_BLOCK_TAIL_P(mp,block) xfs_dir2_block_tail_p(mp,block) -static inline xfs_dir2_block_tail_t * -xfs_dir2_block_tail_p(struct xfs_mount *mp, xfs_dir2_block_t *block) -{ - return (((xfs_dir2_block_tail_t *) - ((char *)(block) + (mp)->m_dirblksize)) - 1); -} - -/* - * Pointer to the leaf entries embedded in a data block (1-block format) - */ -#define XFS_DIR2_BLOCK_LEAF_P(btp) xfs_dir2_block_leaf_p(btp) -static inline struct xfs_dir2_leaf_entry * -xfs_dir2_block_leaf_p(xfs_dir2_block_tail_t *btp) -{ - return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count); -} - -/* - * Function declarations. - */ -extern int xfs_dir2_block_addname(struct xfs_da_args *args); -extern int xfs_dir2_block_getdents(struct xfs_trans *tp, struct xfs_inode *dp, - struct uio *uio, int *eofp, - struct xfs_dirent *dbp, xfs_dir2_put_t put); -extern int xfs_dir2_block_lookup(struct xfs_da_args *args); -extern int xfs_dir2_block_removename(struct xfs_da_args *args); -extern int xfs_dir2_block_replace(struct xfs_da_args *args); -extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); -extern int xfs_dir2_sf_to_block(struct xfs_da_args *args); - -#endif /* __XFS_DIR2_BLOCK_H__ */ diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index f7c79921707..8c2f6422648 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,41 +18,37 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" -#ifdef DEBUG /* * Check the consistency of the data block. * The input can also be a block-format directory. - * Pop an assert if we find anything bad. + * Return 0 is the buffer is good, otherwise an error. */ -void -xfs_dir2_data_check( - xfs_inode_t *dp, /* incore inode pointer */ - xfs_dabuf_t *bp) /* data block's buffer */ +int +__xfs_dir3_data_check( + struct xfs_inode *dp, /* incore inode pointer */ + struct xfs_buf *bp) /* data block's buffer */ { xfs_dir2_dataptr_t addr; /* addr for leaf lookup */ xfs_dir2_data_free_t *bf; /* bestfree table */ xfs_dir2_block_tail_t *btp=NULL; /* block tail */ int count; /* count of entries found */ - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_unused_t *dup; /* unused entry */ @@ -64,37 +61,70 @@ xfs_dir2_data_check( xfs_mount_t *mp; /* filesystem mount point */ char *p; /* current data position */ int stale; /* count of stale leaves */ + struct xfs_name name; + const struct xfs_dir_ops *ops; + struct xfs_da_geometry *geo; - mp = dp->i_mount; - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - bf = d->hdr.bestfree; - p = (char *)d->u; - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - lep = XFS_DIR2_BLOCK_LEAF_P(btp); + mp = bp->b_target->bt_mount; + geo = mp->m_dir_geo; + + /* + * We can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + hdr = bp->b_addr; + p = (char *)ops->data_entry_p(hdr); + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + btp = xfs_dir2_block_tail_p(geo, hdr); + lep = xfs_dir2_block_leaf_p(btp); endp = (char *)lep; - } else - endp = (char *)d + mp->m_dirblksize; - count = lastfree = freeseen = 0; + + /* + * The number of leaf entries is limited by the size of the + * block and the amount of space used by the data entries. + * We don't know how much space is used by the data entries yet, + * so just ensure that the count falls somewhere inside the + * block right now. + */ + XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); + break; + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + endp = (char *)hdr + geo->blksize; + break; + default: + XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); + return EFSCORRUPTED; + } + /* * Account for zero bestfree entries. */ + bf = ops->data_bestfree_p(hdr); + count = lastfree = freeseen = 0; if (!bf[0].length) { - ASSERT(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - ASSERT(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - ASSERT(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); freeseen |= 1 << 2; } - ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); + + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + be16_to_cpu(bf[1].length)); + XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. */ @@ -106,17 +136,20 @@ xfs_dir2_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT(lastfree == 0); - ASSERT(be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup)) == - (char *)dup - (char *)d); - dfp = xfs_dir2_data_freefind(d, dup); + XFS_WANT_CORRUPTED_RETURN(lastfree == 0); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == + (char *)dup - (char *)hdr); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - ASSERT((freeseen & (1 << i)) == 0); + XFS_WANT_CORRUPTED_RETURN( + (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - ASSERT(be16_to_cpu(dup->length) <= - be16_to_cpu(bf[2].length)); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(dup->length) <= + be16_to_cpu(bf[2].length)); } p += be16_to_cpu(dup->length); lastfree = 1; @@ -129,42 +162,182 @@ xfs_dir2_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - ASSERT(dep->namelen != 0); - ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); - ASSERT(be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep)) == - (char *)dep - (char *)d); + XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN( + !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN( + be16_to_cpu(*ops->data_entry_tag_p(dep)) == + (char *)dep - (char *)hdr); + XFS_WANT_CORRUPTED_RETURN( + ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - addr = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)d)); - hash = xfs_da_hashname((char *)dep->name, dep->namelen); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (xfs_dir2_data_aoff_t) + ((char *)dep - (char *)hdr)); + name.name = dep->name; + name.len = dep->namelen; + hash = mp->m_dirnameops->hashname(&name); for (i = 0; i < be32_to_cpu(btp->count); i++) { if (be32_to_cpu(lep[i].address) == addr && be32_to_cpu(lep[i].hashval) == hash) break; } - ASSERT(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); } - p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. */ - ASSERT(freeseen == 7); - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { + XFS_WANT_CORRUPTED_RETURN(freeseen == 7); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { - if (be32_to_cpu(lep[i].address) == XFS_DIR2_NULL_DATAPTR) + if (lep[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); + XFS_WANT_CORRUPTED_RETURN( + be32_to_cpu(lep[i].hashval) >= + be32_to_cpu(lep[i - 1].hashval)); } - ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - ASSERT(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(count == + be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); } + return 0; +} + +static bool +xfs_dir3_data_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (hdr3->magic != cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) + return false; + } + if (__xfs_dir3_data_check(NULL, bp)) + return false; + return true; +} + +/* + * Readahead of the first block of the directory when it is opened is completely + * oblivious to the format of the directory. Hence we can either get a block + * format buffer or a data format buffer on readahead. + */ +static void +xfs_dir3_data_reada_verify( + struct xfs_buf *bp) +{ + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + switch (hdr->magic) { + case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): + case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC): + bp->b_ops = &xfs_dir3_block_buf_ops; + bp->b_ops->verify_read(bp); + return; + case cpu_to_be32(XFS_DIR2_DATA_MAGIC): + case cpu_to_be32(XFS_DIR3_DATA_MAGIC): + xfs_dir3_data_verify(bp); + return; + default: + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + break; + } +} + +static void +xfs_dir3_data_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_data_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_data_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_data_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .verify_read = xfs_dir3_data_read_verify, + .verify_write = xfs_dir3_data_write_verify, +}; + +static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .verify_read = xfs_dir3_data_reada_verify, + .verify_write = xfs_dir3_data_write_verify, +}; + + +int +xfs_dir3_data_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, + XFS_DATA_FORK, &xfs_dir3_data_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); + return err; +} + +int +xfs_dir3_data_readahead( + struct xfs_inode *dp, + xfs_dablk_t bno, + xfs_daddr_t mapped_bno) +{ + return xfs_da_reada_buf(dp, bno, mapped_bno, + XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops); } -#endif /* * Given a data block and an unused entry from that block, @@ -172,27 +345,31 @@ xfs_dir2_data_check( */ xfs_dir2_data_free_t * xfs_dir2_data_freefind( - xfs_dir2_data_t *d, /* data block */ - xfs_dir2_data_unused_t *dup) /* data unused entry */ + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup) /* unused space */ { xfs_dir2_data_free_t *dfp; /* bestfree entry */ xfs_dir2_data_aoff_t off; /* offset value needed */ -#if defined(DEBUG) && defined(__KERNEL__) +#ifdef DEBUG int matched; /* matched the value */ int seenzero; /* saw a 0 bestfree entry */ #endif - off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)d); -#if defined(DEBUG) && defined(__KERNEL__) + off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr); + +#ifdef DEBUG /* * Validate some consistency in the bestfree table. * Check order, non-overlapping entries, and if we find the * one we're looking for it has to be exact. */ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - for (dfp = &d->hdr.bestfree[0], seenzero = matched = 0; - dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + for (dfp = &bf[0], seenzero = matched = 0; + dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) { ASSERT(!dfp->length); @@ -208,7 +385,7 @@ xfs_dir2_data_freefind( else ASSERT(be16_to_cpu(dfp->offset) + be16_to_cpu(dfp->length) <= off); ASSERT(matched || be16_to_cpu(dfp->length) >= be16_to_cpu(dup->length)); - if (dfp > &d->hdr.bestfree[0]) + if (dfp > &bf[0]) ASSERT(be16_to_cpu(dfp[-1].length) >= be16_to_cpu(dfp[0].length)); } #endif @@ -217,14 +394,12 @@ xfs_dir2_data_freefind( * it can't be there since they're sorted. */ if (be16_to_cpu(dup->length) < - be16_to_cpu(d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT - 1].length)) + be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length)) return NULL; /* * Look at the three bestfree entries for our guy. */ - for (dfp = &d->hdr.bestfree[0]; - dfp < &d->hdr.bestfree[XFS_DIR2_DATA_FD_COUNT]; - dfp++) { + for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) { if (!dfp->offset) return NULL; if (be16_to_cpu(dfp->offset) == off) @@ -241,20 +416,21 @@ xfs_dir2_data_freefind( */ xfs_dir2_data_free_t * /* entry inserted */ xfs_dir2_data_freeinsert( - xfs_dir2_data_t *d, /* data block pointer */ - xfs_dir2_data_unused_t *dup, /* unused space */ + struct xfs_dir2_data_hdr *hdr, /* data block pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree table pointer */ + struct xfs_dir2_data_unused *dup, /* unused space */ int *loghead) /* log the data header (out) */ { - xfs_dir2_data_free_t *dfp; /* bestfree table pointer */ xfs_dir2_data_free_t new; /* new bestfree entry */ -#ifdef __KERNEL__ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); -#endif - dfp = d->hdr.bestfree; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + new.length = dup->length; - new.offset = cpu_to_be16((char *)dup - (char *)d); + new.offset = cpu_to_be16((char *)dup - (char *)hdr); + /* * Insert at position 0, 1, or 2; or not at all. */ @@ -284,36 +460,39 @@ xfs_dir2_data_freeinsert( */ STATIC void xfs_dir2_data_freeremove( - xfs_dir2_data_t *d, /* data block pointer */ - xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */ + struct xfs_dir2_data_hdr *hdr, /* data block header */ + struct xfs_dir2_data_free *bf, /* bestfree table pointer */ + struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */ int *loghead) /* out: log data header */ { -#ifdef __KERNEL__ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); -#endif + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + /* * It's the first entry, slide the next 2 up. */ - if (dfp == &d->hdr.bestfree[0]) { - d->hdr.bestfree[0] = d->hdr.bestfree[1]; - d->hdr.bestfree[1] = d->hdr.bestfree[2]; + if (dfp == &bf[0]) { + bf[0] = bf[1]; + bf[1] = bf[2]; } /* * It's the second entry, slide the 3rd entry up. */ - else if (dfp == &d->hdr.bestfree[1]) - d->hdr.bestfree[1] = d->hdr.bestfree[2]; + else if (dfp == &bf[1]) + bf[1] = bf[2]; /* * Must be the last entry. */ else - ASSERT(dfp == &d->hdr.bestfree[2]); + ASSERT(dfp == &bf[2]); /* * Clear the 3rd entry, must be zero now. */ - d->hdr.bestfree[2].length = 0; - d->hdr.bestfree[2].offset = 0; + bf[2].length = 0; + bf[2].offset = 0; *loghead = 1; } @@ -322,37 +501,39 @@ xfs_dir2_data_freeremove( */ void xfs_dir2_data_freescan( - xfs_mount_t *mp, /* filesystem mount point */ - xfs_dir2_data_t *d, /* data block pointer */ - int *loghead, /* out: log data header */ - char *aendp) /* in: caller's endp */ + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) { xfs_dir2_block_tail_t *btp; /* block tail */ xfs_dir2_data_entry_t *dep; /* active data entry */ xfs_dir2_data_unused_t *dup; /* unused data entry */ + struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ + struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); -#ifdef __KERNEL__ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); -#endif /* * Start by clearing the table. */ - memset(d->hdr.bestfree, 0, sizeof(d->hdr.bestfree)); + bf = dp->d_ops->data_bestfree_p(hdr); + memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)d->u; - if (aendp) - endp = aendp; - else if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC) { - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - endp = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + p = (char *)dp->d_ops->data_entry_p(hdr); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { + btp = xfs_dir2_block_tail_p(geo, hdr); + endp = (char *)xfs_dir2_block_leaf_p(btp); } else - endp = (char *)d + mp->m_dirblksize; + endp = (char *)hdr + geo->blksize; /* * Loop over the block's entries. */ @@ -362,9 +543,9 @@ xfs_dir2_data_freescan( * If it's a free entry, insert it. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - ASSERT((char *)dup - (char *)d == - be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); - xfs_dir2_data_freeinsert(d, dup, loghead); + ASSERT((char *)dup - (char *)hdr == + be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); + xfs_dir2_data_freeinsert(hdr, bf, dup, loghead); p += be16_to_cpu(dup->length); } /* @@ -372,9 +553,9 @@ xfs_dir2_data_freescan( */ else { dep = (xfs_dir2_data_entry_t *)p; - ASSERT((char *)dep - (char *)d == - be16_to_cpu(*XFS_DIR2_DATA_ENTRY_TAG_P(dep))); - p += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ASSERT((char *)dep - (char *)hdr == + be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); + p += dp->d_ops->data_entsize(dep->namelen); } } } @@ -384,15 +565,16 @@ xfs_dir2_data_freescan( * Give back the buffer for the created block. */ int /* error */ -xfs_dir2_data_init( +xfs_dir3_data_init( xfs_da_args_t *args, /* directory operation args */ xfs_dir2_db_t blkno, /* logical dir block number */ - xfs_dabuf_t **bpp) /* output block buffer */ + struct xfs_buf **bpp) /* output block buffer */ { - xfs_dabuf_t *bp; /* block buffer */ - xfs_dir2_data_t *d; /* pointer to block */ + struct xfs_buf *bp; /* block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* unused entry pointer */ + struct xfs_dir2_data_free *bf; int error; /* error return value */ int i; /* bestfree index */ xfs_mount_t *mp; /* filesystem mount point */ @@ -405,37 +587,51 @@ xfs_dir2_data_init( /* * Get the buffer set up for the block. */ - error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, blkno), -1, &bp, - XFS_DATA_FORK); - if (error) { + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno), + -1, &bp, XFS_DATA_FORK); + if (error) return error; - } - ASSERT(bp != NULL); + bp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_DATA_BUF); + /* * Initialize the header. */ - d = bp->data; - d->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); - d->hdr.bestfree[0].offset = cpu_to_be16(sizeof(d->hdr)); + hdr = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + memset(hdr3, 0, sizeof(*hdr3)); + hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->uuid, &mp->m_sb.sb_uuid); + + } else + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + + bf = dp->d_ops->data_bestfree_p(hdr); + bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset); for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) { - d->hdr.bestfree[i].length = 0; - d->hdr.bestfree[i].offset = 0; + bf[i].length = 0; + bf[i].offset = 0; } + /* * Set up an unused entry for the block's body. */ - dup = &d->u[0].unused; + dup = dp->d_ops->data_unused_p(hdr); dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); - t=mp->m_dirblksize - (uint)sizeof(d->hdr); - d->hdr.bestfree[0].length = cpu_to_be16(t); + t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset; + bf[0].length = cpu_to_be16(t); dup->length = cpu_to_be16(t); - *XFS_DIR2_DATA_UNUSED_TAG_P(dup) = cpu_to_be16((char *)dup - (char *)d); + *xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr); /* * Log it and return it. */ - xfs_dir2_data_log_header(tp, bp); - xfs_dir2_data_log_unused(tp, bp, dup); + xfs_dir2_data_log_header(args, bp); + xfs_dir2_data_log_unused(args, bp, dup); *bpp = bp; return 0; } @@ -445,18 +641,20 @@ xfs_dir2_data_init( */ void xfs_dir2_data_log_entry( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_entry_t *dep) /* data entry pointer */ { - xfs_dir2_data_t *d; /* data block pointer */ - - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - xfs_da_log_buf(tp, bp, (uint)((char *)dep - (char *)d), - (uint)((char *)(XFS_DIR2_DATA_ENTRY_TAG_P(dep) + 1) - - (char *)d - 1)); + struct xfs_dir2_data_hdr *hdr = bp->b_addr; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr), + (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) - + (char *)hdr - 1)); } /* @@ -464,16 +662,20 @@ xfs_dir2_data_log_entry( */ void xfs_dir2_data_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* block buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_dir2_data_t *d; /* data block pointer */ +#ifdef DEBUG + struct xfs_dir2_data_hdr *hdr = bp->b_addr; - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - xfs_da_log_buf(tp, bp, (uint)((char *)&d->hdr - (char *)d), - (uint)(sizeof(d->hdr) - 1)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); +#endif + + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->data_entry_offset - 1); } /* @@ -481,27 +683,29 @@ xfs_dir2_data_log_header( */ void xfs_dir2_data_log_unused( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup) /* data unused pointer */ { - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr = bp->b_addr; + + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); /* * Log the first part of the unused entry. */ - xfs_da_log_buf(tp, bp, (uint)((char *)dup - (char *)d), + xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr), (uint)((char *)&dup->length + sizeof(dup->length) - - 1 - (char *)d)); + 1 - (char *)hdr)); /* * Log the end (tag) of the unused entry. */ - xfs_da_log_buf(tp, bp, - (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d), - (uint)((char *)XFS_DIR2_DATA_UNUSED_TAG_P(dup) - (char *)d + + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr), + (uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr + sizeof(xfs_dir2_data_off_t) - 1)); } @@ -511,45 +715,47 @@ xfs_dir2_data_log_unused( */ void xfs_dir2_data_make_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_aoff_t offset, /* starting byte offset */ xfs_dir2_data_aoff_t len, /* length in bytes */ int *needlogp, /* out: log header */ int *needscanp) /* out: regen bestfree */ { - xfs_dir2_data_t *d; /* data block pointer */ + xfs_dir2_data_hdr_t *hdr; /* data block pointer */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ char *endptr; /* end of data area */ - xfs_mount_t *mp; /* filesystem mount point */ int needscan; /* need to regen bestfree */ xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *postdup; /* unused entry after us */ xfs_dir2_data_unused_t *prevdup; /* unused entry before us */ + struct xfs_dir2_data_free *bf; + + hdr = bp->b_addr; - mp = tp->t_mountp; - d = bp->data; /* * Figure out where the end of the data area is. */ - if (be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC) - endptr = (char *)d + mp->m_dirblksize; + if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)) + endptr = (char *)hdr + args->geo->blksize; else { xfs_dir2_block_tail_t *btp; /* block tail */ - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, (xfs_dir2_block_t *)d); - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); } /* * If this isn't the start of the block, then back up to * the previous entry and see if it's free. */ - if (offset > sizeof(d->hdr)) { + if (offset > args->dp->d_ops->data_entry_offset) { __be16 *tagp; /* tag just before us */ - tagp = (__be16 *)((char *)d + offset) - 1; - prevdup = (xfs_dir2_data_unused_t *)((char *)d + be16_to_cpu(*tagp)); + tagp = (__be16 *)((char *)hdr + offset) - 1; + prevdup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); if (be16_to_cpu(prevdup->freetag) != XFS_DIR2_DATA_FREE_TAG) prevdup = NULL; } else @@ -558,9 +764,9 @@ xfs_dir2_data_make_free( * If this isn't the end of the block, see if the entry after * us is free. */ - if ((char *)d + offset + len < endptr) { + if ((char *)hdr + offset + len < endptr) { postdup = - (xfs_dir2_data_unused_t *)((char *)d + offset + len); + (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); if (be16_to_cpu(postdup->freetag) != XFS_DIR2_DATA_FREE_TAG) postdup = NULL; } else @@ -571,28 +777,29 @@ xfs_dir2_data_make_free( * Previous and following entries are both free, * merge everything into a single free entry. */ + bf = args->dp->d_ops->data_bestfree_p(hdr); if (prevdup && postdup) { xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */ /* * See if prevdup and/or postdup are in bestfree table. */ - dfp = xfs_dir2_data_freefind(d, prevdup); - dfp2 = xfs_dir2_data_freefind(d, postdup); + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); + dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup); /* * We need a rescan unless there are exactly 2 free entries * namely our two. Then we know what's happening, otherwise * since the third bestfree is there, there might be more * entries. */ - needscan = (d->hdr.bestfree[2].length != 0); + needscan = (bf[2].length != 0); /* * Fix up the new big freespace. */ - be16_add(&prevdup->length, len + be16_to_cpu(postdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = - cpu_to_be16((char *)prevdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, prevdup); + be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length)); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, prevdup); if (!needscan) { /* * Has to be the case that entries 0 and 1 are @@ -601,18 +808,19 @@ xfs_dir2_data_make_free( * Remove entry 1 first then entry 0. */ ASSERT(dfp && dfp2); - if (dfp == &d->hdr.bestfree[1]) { - dfp = &d->hdr.bestfree[0]; + if (dfp == &bf[1]) { + dfp = &bf[0]; ASSERT(dfp2 == dfp); - dfp2 = &d->hdr.bestfree[1]; + dfp2 = &bf[1]; } - xfs_dir2_data_freeremove(d, dfp2, needlogp); - xfs_dir2_data_freeremove(d, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); /* * Now insert the new entry. */ - dfp = xfs_dir2_data_freeinsert(d, prevdup, needlogp); - ASSERT(dfp == &d->hdr.bestfree[0]); + dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup, + needlogp); + ASSERT(dfp == &bf[0]); ASSERT(dfp->length == prevdup->length); ASSERT(!dfp[1].length); ASSERT(!dfp[2].length); @@ -622,67 +830,67 @@ xfs_dir2_data_make_free( * The entry before us is free, merge with it. */ else if (prevdup) { - dfp = xfs_dir2_data_freefind(d, prevdup); - be16_add(&prevdup->length, len); - *XFS_DIR2_DATA_UNUSED_TAG_P(prevdup) = - cpu_to_be16((char *)prevdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, prevdup); + dfp = xfs_dir2_data_freefind(hdr, bf, prevdup); + be16_add_cpu(&prevdup->length, len); + *xfs_dir2_data_unused_tag_p(prevdup) = + cpu_to_be16((char *)prevdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, prevdup); /* * If the previous entry was in the table, the new entry * is longer, so it will be in the table too. Remove * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - (void)xfs_dir2_data_freeinsert(d, prevdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. */ else { needscan = be16_to_cpu(prevdup->length) > - be16_to_cpu(d->hdr.bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* * The following entry is free, merge with it. */ else if (postdup) { - dfp = xfs_dir2_data_freefind(d, postdup); - newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + dfp = xfs_dir2_data_freefind(hdr, bf, postdup); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = - cpu_to_be16((char *)newdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If the following entry was in the table, the new entry * is longer, so it will be in the table too. Remove * the old one and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } /* * Otherwise we need a scan if the new entry is big enough. */ else { needscan = be16_to_cpu(newdup->length) > - be16_to_cpu(d->hdr.bestfree[2].length); + be16_to_cpu(bf[2].length); } } /* * Neither neighbor is free. Make a new entry. */ else { - newdup = (xfs_dir2_data_unused_t *)((char *)d + offset); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(len); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = - cpu_to_be16((char *)newdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, newdup); - (void)xfs_dir2_data_freeinsert(d, newdup, needlogp); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp); } *needscanp = needscan; } @@ -692,15 +900,15 @@ xfs_dir2_data_make_free( */ void xfs_dir2_data_use_free( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* data block buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp, xfs_dir2_data_unused_t *dup, /* unused entry */ xfs_dir2_data_aoff_t offset, /* starting offset to use */ xfs_dir2_data_aoff_t len, /* length to use */ int *needlogp, /* out: need to log header */ int *needscanp) /* out: need regen bestfree */ { - xfs_dir2_data_t *d; /* data block */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_free_t *dfp; /* bestfree pointer */ int matchback; /* matches end of freespace */ int matchfront; /* matches start of freespace */ @@ -708,25 +916,29 @@ xfs_dir2_data_use_free( xfs_dir2_data_unused_t *newdup; /* new unused entry */ xfs_dir2_data_unused_t *newdup2; /* another new unused entry */ int oldlen; /* old unused entry's length */ + struct xfs_dir2_data_free *bf; - d = bp->data; - ASSERT(be32_to_cpu(d->hdr.magic) == XFS_DIR2_DATA_MAGIC || - be32_to_cpu(d->hdr.magic) == XFS_DIR2_BLOCK_MAGIC); + hdr = bp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)); ASSERT(be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG); - ASSERT(offset >= (char *)dup - (char *)d); - ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)d); - ASSERT((char *)dup - (char *)d == be16_to_cpu(*XFS_DIR2_DATA_UNUSED_TAG_P(dup))); + ASSERT(offset >= (char *)dup - (char *)hdr); + ASSERT(offset + len <= (char *)dup + be16_to_cpu(dup->length) - (char *)hdr); + ASSERT((char *)dup - (char *)hdr == be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup))); /* * Look up the entry in the bestfree table. */ - dfp = xfs_dir2_data_freefind(d, dup); oldlen = be16_to_cpu(dup->length); - ASSERT(dfp || oldlen <= be16_to_cpu(d->hdr.bestfree[2].length)); + bf = args->dp->d_ops->data_bestfree_p(hdr); + dfp = xfs_dir2_data_freefind(hdr, bf, dup); + ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length)); /* * Check for alignment with front and back of the entry. */ - matchfront = (char *)dup - (char *)d == offset; - matchback = (char *)dup + oldlen - (char *)d == offset + len; + matchfront = (char *)dup - (char *)hdr == offset; + matchback = (char *)dup + oldlen - (char *)hdr == offset + len; ASSERT(*needscanp == 0); needscan = 0; /* @@ -735,9 +947,10 @@ xfs_dir2_data_use_free( */ if (matchfront && matchback) { if (dfp) { - needscan = (d->hdr.bestfree[2].offset != 0); + needscan = (bf[2].offset != 0); if (!needscan) - xfs_dir2_data_freeremove(d, dfp, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); } } /* @@ -745,27 +958,28 @@ xfs_dir2_data_use_free( * Make a new entry with the remaining freespace. */ else if (matchfront) { - newdup = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup->length = cpu_to_be16(oldlen - len); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = - cpu_to_be16((char *)newdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); /* * If we got inserted at the last slot, * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &d->hdr.bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -774,25 +988,26 @@ xfs_dir2_data_use_free( */ else if (matchback) { newdup = dup; - newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = - cpu_to_be16((char *)newdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, newdup); + newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); /* * If it was in the table, remove it and add the new one. */ if (dfp) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - dfp = xfs_dir2_data_freeinsert(d, newdup, needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp); + dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); ASSERT(dfp != NULL); ASSERT(dfp->length == newdup->length); - ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)d); + ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr); /* * If we got inserted at the last slot, * that means we don't know if there was a better * choice for the last slot, or not. Rescan. */ - needscan = dfp == &d->hdr.bestfree[2]; + needscan = dfp == &bf[2]; } } /* @@ -801,16 +1016,16 @@ xfs_dir2_data_use_free( */ else { newdup = dup; - newdup->length = cpu_to_be16(((char *)d + offset) - (char *)newdup); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup) = - cpu_to_be16((char *)newdup - (char *)d); - xfs_dir2_data_log_unused(tp, bp, newdup); - newdup2 = (xfs_dir2_data_unused_t *)((char *)d + offset + len); + newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup); + *xfs_dir2_data_unused_tag_p(newdup) = + cpu_to_be16((char *)newdup - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup); + newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len); newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG); newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length)); - *XFS_DIR2_DATA_UNUSED_TAG_P(newdup2) = - cpu_to_be16((char *)newdup2 - (char *)d); - xfs_dir2_data_log_unused(tp, bp, newdup2); + *xfs_dir2_data_unused_tag_p(newdup2) = + cpu_to_be16((char *)newdup2 - (char *)hdr); + xfs_dir2_data_log_unused(args, bp, newdup2); /* * If the old entry was in the table, we need to scan * if the 3rd entry was valid, since these entries @@ -820,13 +1035,14 @@ xfs_dir2_data_use_free( * the 2 new will work. */ if (dfp) { - needscan = (d->hdr.bestfree[2].length != 0); + needscan = (bf[2].length != 0); if (!needscan) { - xfs_dir2_data_freeremove(d, dfp, needlogp); - (void)xfs_dir2_data_freeinsert(d, newdup, - needlogp); - (void)xfs_dir2_data_freeinsert(d, newdup2, - needlogp); + xfs_dir2_data_freeremove(hdr, bf, dfp, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup, + needlogp); + xfs_dir2_data_freeinsert(hdr, bf, newdup2, + needlogp); } } } diff --git a/fs/xfs/xfs_dir2_data.h b/fs/xfs/xfs_dir2_data.h deleted file mode 100644 index a6ae2d21c40..00000000000 --- a/fs/xfs/xfs_dir2_data.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_DATA_H__ -#define __XFS_DIR2_DATA_H__ - -/* - * Directory format 2, data block structures. - */ - -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_inode; -struct xfs_trans; - -/* - * Constants. - */ -#define XFS_DIR2_DATA_MAGIC 0x58443244 /* XD2D: for multiblock dirs */ -#define XFS_DIR2_DATA_ALIGN_LOG 3 /* i.e., 8 bytes */ -#define XFS_DIR2_DATA_ALIGN (1 << XFS_DIR2_DATA_ALIGN_LOG) -#define XFS_DIR2_DATA_FREE_TAG 0xffff -#define XFS_DIR2_DATA_FD_COUNT 3 - -/* - * Directory address space divided into sections, - * spaces separated by 32gb. - */ -#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG)) -#define XFS_DIR2_DATA_SPACE 0 -#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_DATA_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATA_OFFSET) - -/* - * Offsets of . and .. in data space (always block 0) - */ -#define XFS_DIR2_DATA_DOT_OFFSET \ - ((xfs_dir2_data_aoff_t)sizeof(xfs_dir2_data_hdr_t)) -#define XFS_DIR2_DATA_DOTDOT_OFFSET \ - (XFS_DIR2_DATA_DOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(1)) -#define XFS_DIR2_DATA_FIRST_OFFSET \ - (XFS_DIR2_DATA_DOTDOT_OFFSET + XFS_DIR2_DATA_ENTSIZE(2)) - -/* - * Structures. - */ - -/* - * Describe a free area in the data block. - * The freespace will be formatted as a xfs_dir2_data_unused_t. - */ -typedef struct xfs_dir2_data_free { - __be16 offset; /* start of freespace */ - __be16 length; /* length of freespace */ -} xfs_dir2_data_free_t; - -/* - * Header for the data blocks. - * Always at the beginning of a directory-sized block. - * The code knows that XFS_DIR2_DATA_FD_COUNT is 3. - */ -typedef struct xfs_dir2_data_hdr { - __be32 magic; /* XFS_DIR2_DATA_MAGIC */ - /* or XFS_DIR2_BLOCK_MAGIC */ - xfs_dir2_data_free_t bestfree[XFS_DIR2_DATA_FD_COUNT]; -} xfs_dir2_data_hdr_t; - -/* - * Active entry in a data block. Aligned to 8 bytes. - * Tag appears as the last 2 bytes. - */ -typedef struct xfs_dir2_data_entry { - __be64 inumber; /* inode number */ - __u8 namelen; /* name length */ - __u8 name[1]; /* name bytes, no null */ - /* variable offset */ - __be16 tag; /* starting offset of us */ -} xfs_dir2_data_entry_t; - -/* - * Unused entry in a data block. Aligned to 8 bytes. - * Tag appears as the last 2 bytes. - */ -typedef struct xfs_dir2_data_unused { - __be16 freetag; /* XFS_DIR2_DATA_FREE_TAG */ - __be16 length; /* total free length */ - /* variable offset */ - __be16 tag; /* starting offset of us */ -} xfs_dir2_data_unused_t; - -typedef union { - xfs_dir2_data_entry_t entry; - xfs_dir2_data_unused_t unused; -} xfs_dir2_data_union_t; - -/* - * Generic data block structure, for xfs_db. - */ -typedef struct xfs_dir2_data { - xfs_dir2_data_hdr_t hdr; /* magic XFS_DIR2_DATA_MAGIC */ - xfs_dir2_data_union_t u[1]; -} xfs_dir2_data_t; - -/* - * Macros. - */ - -/* - * Size of a data entry. - */ -#define XFS_DIR2_DATA_ENTSIZE(n) xfs_dir2_data_entsize(n) -static inline int xfs_dir2_data_entsize(int n) -{ - return (int)roundup(offsetof(xfs_dir2_data_entry_t, name[0]) + (n) + \ - (uint)sizeof(xfs_dir2_data_off_t), XFS_DIR2_DATA_ALIGN); -} - -/* - * Pointer to an entry's tag word. - */ -#define XFS_DIR2_DATA_ENTRY_TAG_P(dep) xfs_dir2_data_entry_tag_p(dep) -static inline __be16 * -xfs_dir2_data_entry_tag_p(xfs_dir2_data_entry_t *dep) -{ - return (__be16 *)((char *)dep + - XFS_DIR2_DATA_ENTSIZE(dep->namelen) - sizeof(__be16)); -} - -/* - * Pointer to a freespace's tag word. - */ -#define XFS_DIR2_DATA_UNUSED_TAG_P(dup) \ - xfs_dir2_data_unused_tag_p(dup) -static inline __be16 * -xfs_dir2_data_unused_tag_p(xfs_dir2_data_unused_t *dup) -{ - return (__be16 *)((char *)dup + - be16_to_cpu(dup->length) - sizeof(__be16)); -} - -/* - * Function declarations. - */ -#ifdef DEBUG -extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_dabuf *bp); -#else -#define xfs_dir2_data_check(dp,bp) -#endif -extern xfs_dir2_data_free_t *xfs_dir2_data_freefind(xfs_dir2_data_t *d, - xfs_dir2_data_unused_t *dup); -extern xfs_dir2_data_free_t *xfs_dir2_data_freeinsert(xfs_dir2_data_t *d, - xfs_dir2_data_unused_t *dup, int *loghead); -extern void xfs_dir2_data_freescan(struct xfs_mount *mp, xfs_dir2_data_t *d, - int *loghead, char *aendp); -extern int xfs_dir2_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, - struct xfs_dabuf **bpp); -extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_entry_t *dep); -extern void xfs_dir2_data_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_unused_t *dup); -extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, - int *needscanp); -extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_dabuf *bp, - xfs_dir2_data_unused_t *dup, - xfs_dir2_data_aoff_t offset, - xfs_dir2_data_aoff_t len, int *needlogp, - int *needscanp); - -#endif /* __XFS_DIR2_DATA_H__ */ diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index b1cf1fbf423..fb0aad4440c 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,44 +18,352 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Local function declarations. */ +static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp, + int *indexp, struct xfs_buf **dbpp); +static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args, + struct xfs_buf *bp); + +/* + * Check the internal consistency of a leaf1 block. + * Pop an assert if something is wrong. + */ #ifdef DEBUG -static void xfs_dir2_leaf_check(xfs_inode_t *dp, xfs_dabuf_t *bp); +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leaf1_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +STATIC bool +xfs_dir3_leaf1_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} #else -#define xfs_dir2_leaf_check(dp, bp) +#define xfs_dir3_leaf_check(dp, bp) #endif -static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, xfs_dabuf_t **lbpp, - int *indexp, xfs_dabuf_t **dbpp); -static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); -static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); +bool +xfs_dir3_leaf_check_int( + struct xfs_mount *mp, + struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, + struct xfs_dir2_leaf *leaf) +{ + struct xfs_dir2_leaf_entry *ents; + xfs_dir2_leaf_tail_t *ltp; + int stale; + int i; + const struct xfs_dir_ops *ops; + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_da_geometry *geo = mp->m_dir_geo; + + /* + * we can be passed a null dp here from a verifier, so we need to go the + * hard way to get them. + */ + ops = xfs_dir_get_ops(mp, dp); + + if (!hdr) { + ops->leaf_hdr_from_disk(&leafhdr, leaf); + hdr = &leafhdr; + } + + ents = ops->leaf_ents_p(leaf); + ltp = xfs_dir2_leaf_tail_p(geo, leaf); + + /* + * XXX (dgc): This value is not restrictive enough. + * Should factor in the size of the bests table as well. + * We can deduce a value for that from di_size. + */ + if (hdr->count > ops->leaf_max_ents(geo)) + return false; + + /* Leaves and bests don't overlap in leaf format. */ + if ((hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR3_LEAF1_MAGIC) && + (char *)&ents[hdr->count] > (char *)xfs_dir2_leaf_bests_p(ltp)) + return false; + + /* Check hash value order, count stale entries. */ + for (i = stale = 0; i < hdr->count; i++) { + if (i + 1 < hdr->count) { + if (be32_to_cpu(ents[i].hashval) > + be32_to_cpu(ents[i + 1].hashval)) + return false; + } + if (ents[i].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + stale++; + } + if (hdr->stale != stale) + return false; + return true; +} + +/* + * We verify the magic numbers before decoding the leaf header so that on debug + * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due + * to incorrect magic numbers. + */ +static bool +xfs_dir3_leaf_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + __uint16_t magic3; + + magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC + : XFS_DIR3_LEAFN_MAGIC; + + if (leaf3->info.hdr.magic != cpu_to_be16(magic3)) + return false; + if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else { + if (leaf->hdr.info.magic != cpu_to_be16(magic)) + return false; + } + + return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf); +} + +static void +__read_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_leaf_verify(bp, magic)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +__write_verify( + struct xfs_buf *bp, + __uint16_t magic) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_leaf_verify(bp, magic)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF); +} + +static void +xfs_dir3_leaf1_read_verify( + struct xfs_buf *bp) +{ + __read_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + +static void +xfs_dir3_leaf1_write_verify( + struct xfs_buf *bp) +{ + __write_verify(bp, XFS_DIR2_LEAF1_MAGIC); +} + +static void +xfs_dir3_leafn_read_verify( + struct xfs_buf *bp) +{ + __read_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + +static void +xfs_dir3_leafn_write_verify( + struct xfs_buf *bp) +{ + __write_verify(bp, XFS_DIR2_LEAFN_MAGIC); +} + +const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .verify_read = xfs_dir3_leaf1_read_verify, + .verify_write = xfs_dir3_leaf1_write_verify, +}; + +const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .verify_read = xfs_dir3_leafn_read_verify, + .verify_write = xfs_dir3_leafn_write_verify, +}; + +static int +xfs_dir3_leaf_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leaf1_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAF1_BUF); + return err; +} + +int +xfs_dir3_leafn_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_leafn_buf_ops); + if (!err && tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_LEAFN_BUF); + return err; +} + +/* + * Initialize a new leaf block, leaf1 or leafn magic accepted. + */ +static void +xfs_dir3_leaf_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *bp, + xfs_ino_t owner, + __uint16_t type) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + + memset(leaf3, 0, sizeof(*leaf3)); + + leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) + ? cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) + : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.owner = cpu_to_be64(owner); + uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_uuid); + } else { + memset(leaf, 0, sizeof(*leaf)); + leaf->hdr.info.magic = cpu_to_be16(type); + } + + /* + * If it's a leaf-format directory initialize the tail. + * Caller is responsible for initialising the bests table. + */ + if (type == XFS_DIR2_LEAF1_MAGIC) { + struct xfs_dir2_leaf_tail *ltp; + + ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf); + ltp->bestcount = 0; + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF); + } else { + bp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF); + } +} + +int +xfs_dir3_leaf_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t bno, + struct xfs_buf **bpp, + __uint16_t magic) +{ + struct xfs_inode *dp = args->dp; + struct xfs_trans *tp = args->trans; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + + ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC); + ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) && + bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic); + xfs_dir3_leaf_log_header(args, bp); + if (magic == XFS_DIR2_LEAF1_MAGIC) + xfs_dir3_leaf_log_tail(args, bp); + *bpp = bp; + return 0; +} /* * Convert a block form directory to a leaf form directory. @@ -62,16 +371,16 @@ static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_dabuf *bp); int /* error */ xfs_dir2_block_to_leaf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *dbp) /* input block's buffer */ + struct xfs_buf *dbp) /* input block's buffer */ { __be16 *bestsp; /* leaf's bestsp entries */ xfs_dablk_t blkno; /* leaf block's bno */ - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_leaf_entry_t *blp; /* block's leaf entries */ xfs_dir2_block_tail_t *btp; /* block's tail */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *lbp; /* leaf block's buffer */ + struct xfs_buf *lbp; /* leaf block's buffer */ xfs_dir2_db_t ldb; /* leaf block's bno */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_tail_t *ltp; /* leaf's tail */ @@ -79,8 +388,12 @@ xfs_dir2_block_to_leaf( int needlog; /* need to log block header */ int needscan; /* need to rescan bestfree */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_block_to_leaf(args); - xfs_dir2_trace_args_b("block_to_leaf", args, dbp); dp = args->dp; mp = dp->i_mount; tp = args->trans; @@ -92,68 +405,200 @@ xfs_dir2_block_to_leaf( if ((error = xfs_da_grow_inode(args, &blkno))) { return error; } - ldb = XFS_DIR2_DA_TO_DB(mp, blkno); - ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp)); + ldb = xfs_dir2_da_to_db(args->geo, blkno); + ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET)); /* * Initialize the leaf block, get a buffer for it. */ - if ((error = xfs_dir2_leaf_init(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC))) { + error = xfs_dir3_leaf_get_buf(args, ldb, &lbp, XFS_DIR2_LEAF1_MAGIC); + if (error) return error; - } - ASSERT(lbp != NULL); - leaf = lbp->data; - block = dbp->data; - xfs_dir2_data_check(dp, dbp); - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + + leaf = lbp->b_addr; + hdr = dbp->b_addr; + xfs_dir3_data_check(dp, dbp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); + bf = dp->d_ops->data_bestfree_p(hdr); + ents = dp->d_ops->leaf_ents_p(leaf); + /* * Set the counts in the leaf header. */ - leaf->hdr.count = cpu_to_be16(be32_to_cpu(btp->count)); - leaf->hdr.stale = cpu_to_be16(be32_to_cpu(btp->stale)); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + leafhdr.count = be32_to_cpu(btp->count); + leafhdr.stale = be32_to_cpu(btp->stale); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + /* * Could compact these but I think we always do the conversion * after squeezing out stale entries. */ - memcpy(leaf->ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, lbp, 0, be16_to_cpu(leaf->hdr.count) - 1); + memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1); needscan = 0; needlog = 1; /* * Make the space formerly occupied by the leaf entries and block * tail be free. */ - xfs_dir2_data_make_free(tp, dbp, - (xfs_dir2_data_aoff_t)((char *)blp - (char *)block), - (xfs_dir2_data_aoff_t)((char *)block + mp->m_dirblksize - + xfs_dir2_data_make_free(args, dbp, + (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), + (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize - (char *)blp), &needlog, &needscan); /* * Fix up the block header, make it a data block. */ - block->hdr.magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + dbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, dbp, XFS_BLFT_DIR_DATA_BUF); + if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) + hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); + else + hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); + if (needscan) - xfs_dir2_data_freescan(mp, (xfs_dir2_data_t *)block, &needlog, - NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Set up leaf tail and bests table. */ - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); ltp->bestcount = cpu_to_be32(1); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); - bestsp[0] = block->hdr.bestfree[0].length; + bestsp = xfs_dir2_leaf_bests_p(ltp); + bestsp[0] = bf[0].length; /* * Log the data header and leaf bests table. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_leaf_check(dp, lbp); - xfs_dir2_data_check(dp, dbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, 0); - xfs_da_buf_done(lbp); + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, 0); return 0; } +STATIC void +xfs_dir3_leaf_find_stale( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, + int index, + int *lowstale, + int *highstale) +{ + /* + * Find the first stale entry before our index, if any. + */ + for (*lowstale = index - 1; *lowstale >= 0; --*lowstale) { + if (ents[*lowstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + break; + } + + /* + * Find the first stale entry at or after our index, if any. + * Stop if the result would require moving more entries than using + * lowstale. + */ + for (*highstale = index; *highstale < leafhdr->count; ++*highstale) { + if (ents[*highstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) + break; + if (*lowstale >= 0 && index - *lowstale <= *highstale - index) + break; + } +} + +struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_find_entry( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, + int index, /* leaf table position */ + int compact, /* need to compact leaves */ + int lowstale, /* index of prev stale leaf */ + int highstale, /* index of next stale leaf */ + int *lfloglow, /* low leaf logging index */ + int *lfloghigh) /* high leaf logging index */ +{ + if (!leafhdr->stale) { + xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ + + /* + * Now we need to make room to insert the leaf entry. + * + * If there are no stale entries, just insert a hole at index. + */ + lep = &ents[index]; + if (index < leafhdr->count) + memmove(lep + 1, lep, + (leafhdr->count - index) * sizeof(*lep)); + + /* + * Record low and high logging indices for the leaf. + */ + *lfloglow = index; + *lfloghigh = leafhdr->count++; + return lep; + } + + /* + * There are stale entries. + * + * We will use one of them for the new entry. It's probably not at + * the right location, so we'll have to shift some up or down first. + * + * If we didn't compact before, we need to find the nearest stale + * entries before and after our insertion point. + */ + if (compact == 0) + xfs_dir3_leaf_find_stale(leafhdr, ents, index, + &lowstale, &highstale); + + /* + * If the low one is better, use it. + */ + if (lowstale >= 0 && + (highstale == leafhdr->count || + index - lowstale - 1 < highstale - index)) { + ASSERT(index - lowstale - 1 >= 0); + ASSERT(ents[lowstale].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + + /* + * Copy entries up to cover the stale entry and make room + * for the new entry. + */ + if (index - lowstale - 1 > 0) { + memmove(&ents[lowstale], &ents[lowstale + 1], + (index - lowstale - 1) * + sizeof(xfs_dir2_leaf_entry_t)); + } + *lfloglow = MIN(lowstale, *lfloglow); + *lfloghigh = MAX(index - 1, *lfloghigh); + leafhdr->stale--; + return &ents[index - 1]; + } + + /* + * The high one is better, so use that one. + */ + ASSERT(highstale - index >= 0); + ASSERT(ents[highstale].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)); + + /* + * Copy entries down to cover the stale entry and make room for the + * new entry. + */ + if (highstale - index > 0) { + memmove(&ents[index + 1], &ents[index], + (highstale - index) * sizeof(xfs_dir2_leaf_entry_t)); + } + *lfloglow = MIN(index, *lfloglow); + *lfloghigh = MAX(highstale, *lfloghigh); + leafhdr->stale--; + return &ents[index]; +} + /* * Add an entry to a leaf form directory. */ @@ -163,8 +608,8 @@ xfs_dir2_leaf_addname( { __be16 *bestsp; /* freespace table in leaf */ int compact; /* need to compact leaves */ - xfs_dir2_data_t *data; /* data block structure */ - xfs_dabuf_t *dbp; /* data block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry */ @@ -173,7 +618,7 @@ xfs_dir2_leaf_addname( int highstale; /* index of next stale leaf */ int i; /* temporary, index */ int index; /* leaf table position */ - xfs_dabuf_t *lbp; /* leaf's buffer */ + struct xfs_buf *lbp; /* leaf's buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int length; /* length of new entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry table pointer */ @@ -188,20 +633,20 @@ xfs_dir2_leaf_addname( __be16 *tagp; /* end of data entry */ xfs_trans_t *tp; /* transaction pointer */ xfs_dir2_db_t use_block; /* data block number */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_addname(args); - xfs_dir2_trace_args("leaf_addname", args); dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block. - */ - error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK); - if (error) { + + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + if (error) return error; - } - ASSERT(lbp != NULL); + /* * Look up the entry by hash value and name. * We know it's not there, our caller has already done a lookup. @@ -209,24 +654,27 @@ xfs_dir2_leaf_addname( * But if there are dup hash values the index is of the first of those. */ index = xfs_dir2_leaf_search_hash(args, lbp); - leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); + length = dp->d_ops->data_entsize(args->namelen); + /* * See if there are any entries with the same hash value * and space in their block for the new entry. * This is good because it puts multiple same-hash value entries * in a data block, improving the lookup of those entries. */ - for (use_block = -1, lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (use_block = -1, lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; index++, lep++) { if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) continue; - i = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(i < be32_to_cpu(ltp->bestcount)); - ASSERT(be16_to_cpu(bestsp[i]) != NULLDATAOFF); + ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF)); if (be16_to_cpu(bestsp[i]) >= length) { use_block = i; break; @@ -240,7 +688,8 @@ xfs_dir2_leaf_addname( /* * Remember a block we see that's missing. */ - if (be16_to_cpu(bestsp[i]) == NULLDATAOFF && use_block == -1) + if (bestsp[i] == cpu_to_be16(NULLDATAOFF) && + use_block == -1) use_block = i; else if (be16_to_cpu(bestsp[i]) >= length) { use_block = i; @@ -251,41 +700,43 @@ xfs_dir2_leaf_addname( /* * How many bytes do we need in the leaf block? */ - needbytes = - (leaf->hdr.stale ? 0 : (uint)sizeof(leaf->ents[0])) + - (use_block != -1 ? 0 : (uint)sizeof(leaf->bests[0])); + needbytes = 0; + if (!leafhdr.stale) + needbytes += sizeof(xfs_dir2_leaf_entry_t); + if (use_block == -1) + needbytes += sizeof(xfs_dir2_data_off_t); + /* * Now kill use_block if it refers to a missing block, so we * can use it as an indication of allocation needed. */ - if (use_block != -1 && be16_to_cpu(bestsp[use_block]) == NULLDATAOFF) + if (use_block != -1 && bestsp[use_block] == cpu_to_be16(NULLDATAOFF)) use_block = -1; /* * If we don't have enough free bytes but we can make enough * by compacting out stale entries, we'll do that. */ - if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < needbytes && - be16_to_cpu(leaf->hdr.stale) > 1) { + if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes && + leafhdr.stale > 1) compact = 1; - } + /* * Otherwise if we don't have enough free bytes we need to * convert to node form. */ - else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < - needbytes) { + else if ((char *)bestsp - (char *)&ents[leafhdr.count] < needbytes) { /* * Just checking or no space reservation, give up. */ - if (args->justcheck || args->total == 0) { - xfs_da_brelse(tp, lbp); + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || + args->total == 0) { + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* * Convert to node form. */ error = xfs_dir2_leaf_to_node(args, lbp); - xfs_da_buf_done(lbp); if (error) return error; /* @@ -302,8 +753,8 @@ xfs_dir2_leaf_addname( * If just checking, then it will fit unless we needed to allocate * a new data block. */ - if (args->justcheck) { - xfs_da_brelse(tp, lbp); + if (args->op_flags & XFS_DA_OP_JUSTCHECK) { + xfs_trans_brelse(tp, lbp); return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; } /* @@ -311,7 +762,7 @@ xfs_dir2_leaf_addname( * changed anything. */ if (args->total == 0 && use_block == -1) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOSPC); } /* @@ -321,15 +772,15 @@ xfs_dir2_leaf_addname( * point later. */ if (compact) { - xfs_dir2_leaf_compact_x1(lbp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); } /* * There are stale entries, so we'll need log-low and log-high * impossibly bad values later. */ - else if (be16_to_cpu(leaf->hdr.stale)) { - lfloglow = be16_to_cpu(leaf->hdr.count); + else if (leafhdr.stale) { + lfloglow = leafhdr.count; lfloghigh = -1; } /* @@ -342,14 +793,14 @@ xfs_dir2_leaf_addname( */ if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &use_block))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, lbp); return error; } /* * Initialize the block. */ - if ((error = xfs_dir2_data_init(args, use_block, &dbp))) { - xfs_da_brelse(tp, lbp); + if ((error = xfs_dir3_data_init(args, use_block, &dbp))) { + xfs_trans_brelse(tp, lbp); return error; } /* @@ -360,46 +811,48 @@ xfs_dir2_leaf_addname( bestsp--; memmove(&bestsp[0], &bestsp[1], be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0])); - be32_add(<p->bestcount, 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + be32_add_cpu(<p->bestcount, 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } /* * If we're filling in a previously empty block just log it. */ else - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); - data = dbp->data; - bestsp[use_block] = data->hdr.bestfree[0].length; + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bestsp[use_block] = bf[0].length; grown = 1; - } - /* - * Already had space in some data block. - * Just read that one in. - */ - else { - if ((error = - xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, use_block), - -1, &dbp, XFS_DATA_FORK))) { - xfs_da_brelse(tp, lbp); + } else { + /* + * Already had space in some data block. + * Just read that one in. + */ + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, use_block), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); return error; } - data = dbp->data; + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); grown = 0; } - xfs_dir2_data_check(dp, dbp); /* * Point to the biggest freespace in our data block. */ dup = (xfs_dir2_data_unused_t *) - ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); ASSERT(be16_to_cpu(dup->length) >= length); needscan = needlog = 0; /* * Mark the initial part of our freespace in use for the new entry. */ - xfs_dir2_data_use_free(tp, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* * Initialize our new entry (at last). @@ -408,210 +861,78 @@ xfs_dir2_leaf_addname( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); - *tagp = cpu_to_be16((char *)dep - (char *)data); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); /* * Need to scan fix up the bestfree table. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Need to log the data block's header. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_log_entry(tp, dbp, dep); + xfs_dir2_data_log_header(args, dbp); + xfs_dir2_data_log_entry(args, dbp, dep); /* * If the bests table needs to be changed, do it. * Log the change unless we've already done that. */ - if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(data->hdr.bestfree[0].length)) { - bestsp[use_block] = data->hdr.bestfree[0].length; + if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) { + bestsp[use_block] = bf[0].length; if (!grown) - xfs_dir2_leaf_log_bests(tp, lbp, use_block, use_block); - } - /* - * Now we need to make room to insert the leaf entry. - * If there are no stale entries, we just insert a hole at index. - */ - if (!leaf->hdr.stale) { - /* - * lep is still good as the index leaf entry. - */ - if (index < be16_to_cpu(leaf->hdr.count)) - memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); - /* - * Record low and high logging indices for the leaf. - */ - lfloglow = index; - lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add(&leaf->hdr.count, 1); - } - /* - * There are stale entries. - * We will use one of them for the new entry. - * It's probably not at the right location, so we'll have to - * shift some up or down first. - */ - else { - /* - * If we didn't compact before, we need to find the nearest - * stale entries before and after our insertion point. - */ - if (compact == 0) { - /* - * Find the first stale entry before the insertion - * point, if any. - */ - for (lowstale = index - 1; - lowstale >= 0 && - be32_to_cpu(leaf->ents[lowstale].address) != - XFS_DIR2_NULL_DATAPTR; - lowstale--) - continue; - /* - * Find the next stale entry at or after the insertion - * point, if any. Stop if we go so far that the - * lowstale entry would be better. - */ - for (highstale = index; - highstale < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(leaf->ents[highstale].address) != - XFS_DIR2_NULL_DATAPTR && - (lowstale < 0 || - index - lowstale - 1 >= highstale - index); - highstale++) - continue; - } - /* - * If the low one is better, use it. - */ - if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || - index - lowstale - 1 < highstale - index)) { - ASSERT(index - lowstale - 1 >= 0); - ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == - XFS_DIR2_NULL_DATAPTR); - /* - * Copy entries up to cover the stale entry - * and make room for the new entry. - */ - if (index - lowstale - 1 > 0) - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], - (index - lowstale - 1) * sizeof(*lep)); - lep = &leaf->ents[index - 1]; - lfloglow = MIN(lowstale, lfloglow); - lfloghigh = MAX(index - 1, lfloghigh); - } - /* - * The high one is better, so use that one. - */ - else { - ASSERT(highstale - index >= 0); - ASSERT(be32_to_cpu(leaf->ents[highstale].address) == - XFS_DIR2_NULL_DATAPTR); - /* - * Copy entries down to cover the stale entry - * and make room for the new entry. - */ - if (highstale - index > 0) - memmove(&leaf->ents[index + 1], - &leaf->ents[index], - (highstale - index) * sizeof(*lep)); - lep = &leaf->ents[index]; - lfloglow = MIN(index, lfloglow); - lfloghigh = MAX(highstale, lfloghigh); - } - be16_add(&leaf->hdr.stale, -1); + xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block); } + + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + /* * Fill in the new leaf entry. */ lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, use_block, + lep->address = cpu_to_be32( + xfs_dir2_db_off_to_dataptr(args->geo, use_block, be16_to_cpu(*tagp))); /* * Log the leaf fields and give up the buffers. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_leaf_log_ents(tp, lbp, lfloglow, lfloghigh); - xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); - xfs_dir2_data_check(dp, dbp); - xfs_da_buf_done(dbp); + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, lbp); + xfs_dir3_data_check(dp, dbp); return 0; } -#ifdef DEBUG -/* - * Check the internal consistency of a leaf1 block. - * Pop an assert if something is wrong. - */ -void -xfs_dir2_leaf_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf's buffer */ -{ - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail pointer */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ - - leaf = bp->data; - mp = dp->i_mount; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - /* - * This value is not restrictive enough. - * Should factor in the size of the bests table as well. - * We can deduce a value for that from di_size. - */ - ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - /* - * Leaves and bests don't overlap. - */ - ASSERT((char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <= - (char *)XFS_DIR2_LEAF_BESTS_P(ltp)); - /* - * Check hash value order, count stale entries. - */ - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); -} -#endif /* DEBUG */ - /* * Compact out any stale entries in the leaf. * Log the header and changed leaf entries, if any. */ void -xfs_dir2_leaf_compact( +xfs_dir3_leaf_compact( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp) /* leaf buffer */ + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_buf *bp) /* leaf buffer */ { int from; /* source leaf index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int loglow; /* first leaf entry to log */ int to; /* target leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = args->dp; - leaf = bp->data; - if (!leaf->hdr.stale) { + leaf = bp->b_addr; + if (!leafhdr->stale) return; - } + /* * Compress out the stale entries in place. */ - for (from = to = 0, loglow = -1; from < be16_to_cpu(leaf->hdr.count); from++) { - if (be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) + ents = dp->d_ops->leaf_ents_p(leaf); + for (from = to = 0, loglow = -1; from < leafhdr->count; from++) { + if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) continue; /* * Only actually copy the entries that are different. @@ -619,19 +940,21 @@ xfs_dir2_leaf_compact( if (from > to) { if (loglow == -1) loglow = to; - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; } to++; } /* * Update and log the header, log the leaf entries. */ - ASSERT(be16_to_cpu(leaf->hdr.stale) == from - to); - be16_add(&leaf->hdr.count, -(be16_to_cpu(leaf->hdr.stale))); - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(args->trans, bp); + ASSERT(leafhdr->stale == from - to); + leafhdr->count -= leafhdr->stale; + leafhdr->stale = 0; + + dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr); + xfs_dir3_leaf_log_header(args, bp); if (loglow != -1) - xfs_dir2_leaf_log_ents(args->trans, bp, loglow, to - 1); + xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1); } /* @@ -643,8 +966,9 @@ xfs_dir2_leaf_compact( * and leaf logging indices. */ void -xfs_dir2_leaf_compact_x1( - xfs_dabuf_t *bp, /* leaf buffer */ +xfs_dir3_leaf_compact_x1( + struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, /* insertion index */ int *lowstalep, /* out: stale entry before us */ int *highstalep, /* out: stale entry after us */ @@ -655,37 +979,20 @@ xfs_dir2_leaf_compact_x1( int highstale; /* stale entry at/after index */ int index; /* insertion index */ int keepstale; /* source index of kept stale */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ int lowstale; /* stale entry before index */ int newindex=0; /* new insertion index */ int to; /* destination copy index */ - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.stale) > 1); + ASSERT(leafhdr->stale > 1); index = *indexp; - /* - * Find the first stale entry before our index, if any. - */ - for (lowstale = index - 1; - lowstale >= 0 && - be32_to_cpu(leaf->ents[lowstale].address) != XFS_DIR2_NULL_DATAPTR; - lowstale--) - continue; - /* - * Find the first stale entry at or after our index, if any. - * Stop if the answer would be worse than lowstale. - */ - for (highstale = index; - highstale < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(leaf->ents[highstale].address) != XFS_DIR2_NULL_DATAPTR && - (lowstale < 0 || index - lowstale > highstale - index); - highstale++) - continue; + + xfs_dir3_leaf_find_stale(leafhdr, ents, index, &lowstale, &highstale); + /* * Pick the better of lowstale and highstale. */ if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || + (highstale == leafhdr->count || index - lowstale <= highstale - index)) keepstale = lowstale; else @@ -694,14 +1001,14 @@ xfs_dir2_leaf_compact_x1( * Copy the entries in place, removing all the stale entries * except keepstale. */ - for (from = to = 0; from < be16_to_cpu(leaf->hdr.count); from++) { + for (from = to = 0; from < leafhdr->count; from++) { /* * Notice the new value of index. */ if (index == from) newindex = to; if (from != keepstale && - be32_to_cpu(leaf->ents[from].address) == XFS_DIR2_NULL_DATAPTR) { + ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { if (from == to) *lowlogp = to; continue; @@ -715,7 +1022,7 @@ xfs_dir2_leaf_compact_x1( * Copy only the entries that have moved. */ if (from > to) - leaf->ents[to] = leaf->ents[from]; + ents[to] = ents[from]; to++; } ASSERT(from > to); @@ -729,8 +1036,8 @@ xfs_dir2_leaf_compact_x1( /* * Adjust the leaf header values. */ - be16_add(&leaf->hdr.count, -(from - to)); - leaf->hdr.stale = cpu_to_be16(1); + leafhdr->count -= from - to; + leafhdr->stale = 1; /* * Remember the low/high stale value only in the "right" * direction. @@ -738,479 +1045,35 @@ xfs_dir2_leaf_compact_x1( if (lowstale >= newindex) lowstale = -1; else - highstale = be16_to_cpu(leaf->hdr.count); - *highlogp = be16_to_cpu(leaf->hdr.count) - 1; + highstale = leafhdr->count; + *highlogp = leafhdr->count - 1; *lowstalep = lowstale; *highstalep = highstale; } /* - * Getdents (readdir) for leaf and node directories. - * This reads the data blocks only, so is the same for both forms. - */ -int /* error */ -xfs_dir2_leaf_getdents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *dp, /* incore directory inode */ - uio_t *uio, /* I/O control & vectors */ - int *eofp, /* out: reached end of dir */ - xfs_dirent_t *dbp, /* caller's buffer */ - xfs_dir2_put_t put) /* ABI formatting routine */ -{ - xfs_dabuf_t *bp; /* data block buffer */ - int byteoff; /* offset in current block */ - xfs_dir2_db_t curdb; /* db for current block */ - xfs_dir2_off_t curoff; /* current overall offset */ - xfs_dir2_data_t *data; /* data block structure */ - xfs_dir2_data_entry_t *dep; /* data entry */ - xfs_dir2_data_unused_t *dup; /* unused entry */ - int eof; /* reached end of directory */ - int error = 0; /* error return value */ - int i; /* temporary loop index */ - int j; /* temporary loop index */ - int length; /* temporary length value */ - xfs_bmbt_irec_t *map; /* map vector for blocks */ - xfs_extlen_t map_blocks; /* number of fsbs in map */ - xfs_dablk_t map_off; /* last mapped file offset */ - int map_size; /* total entries in *map */ - int map_valid; /* valid entries in *map */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_off_t newoff; /* new curoff after new blk */ - int nmap; /* mappings to ask xfs_bmapi */ - xfs_dir2_put_args_t *p; /* formatting arg bundle */ - char *ptr = NULL; /* pointer to current data */ - int ra_current; /* number of read-ahead blks */ - int ra_index; /* *map index for read-ahead */ - int ra_offset; /* map entry offset for ra */ - int ra_want; /* readahead count wanted */ - - /* - * If the offset is at or past the largest allowed value, - * give up right away, return eof. - */ - if (uio->uio_offset >= XFS_DIR2_MAX_DATAPTR) { - *eofp = 1; - return 0; - } - mp = dp->i_mount; - /* - * Setup formatting arguments. - */ - p = kmem_alloc(sizeof(*p), KM_SLEEP); - p->dbp = dbp; - p->put = put; - p->uio = uio; - /* - * Set up to bmap a number of blocks based on the caller's - * buffer size, the directory block size, and the filesystem - * block size. - */ - map_size = - howmany(uio->uio_resid + mp->m_dirblksize, - mp->m_sb.sb_blocksize); - map = kmem_alloc(map_size * sizeof(*map), KM_SLEEP); - map_valid = ra_index = ra_offset = ra_current = map_blocks = 0; - bp = NULL; - eof = 1; - /* - * Inside the loop we keep the main offset value as a byte offset - * in the directory file. - */ - curoff = XFS_DIR2_DATAPTR_TO_BYTE(mp, uio->uio_offset); - /* - * Force this conversion through db so we truncate the offset - * down to get the start of the data block. - */ - map_off = XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, curoff)); - /* - * Loop over directory entries until we reach the end offset. - * Get more blocks and readahead as necessary. - */ - while (curoff < XFS_DIR2_LEAF_OFFSET) { - /* - * If we have no buffer, or we're off the end of the - * current buffer, need to get another one. - */ - if (!bp || ptr >= (char *)bp->data + mp->m_dirblksize) { - /* - * If we have a buffer, we need to release it and - * take it out of the mapping. - */ - if (bp) { - xfs_da_brelse(tp, bp); - bp = NULL; - map_blocks -= mp->m_dirblkfsbs; - /* - * Loop to get rid of the extents for the - * directory block. - */ - for (i = mp->m_dirblkfsbs; i > 0; ) { - j = MIN((int)map->br_blockcount, i); - map->br_blockcount -= j; - map->br_startblock += j; - map->br_startoff += j; - /* - * If mapping is done, pitch it from - * the table. - */ - if (!map->br_blockcount && --map_valid) - memmove(&map[0], &map[1], - sizeof(map[0]) * - map_valid); - i -= j; - } - } - /* - * Recalculate the readahead blocks wanted. - */ - ra_want = howmany(uio->uio_resid + mp->m_dirblksize, - mp->m_sb.sb_blocksize) - 1; - /* - * If we don't have as many as we want, and we haven't - * run out of data blocks, get some more mappings. - */ - if (1 + ra_want > map_blocks && - map_off < - XFS_DIR2_BYTE_TO_DA(mp, XFS_DIR2_LEAF_OFFSET)) { - /* - * Get more bmaps, fill in after the ones - * we already have in the table. - */ - nmap = map_size - map_valid; - error = xfs_bmapi(tp, dp, - map_off, - XFS_DIR2_BYTE_TO_DA(mp, - XFS_DIR2_LEAF_OFFSET) - map_off, - XFS_BMAPI_METADATA, NULL, 0, - &map[map_valid], &nmap, NULL, NULL); - /* - * Don't know if we should ignore this or - * try to return an error. - * The trouble with returning errors - * is that readdir will just stop without - * actually passing the error through. - */ - if (error) - break; /* XXX */ - /* - * If we got all the mappings we asked for, - * set the final map offset based on the - * last bmap value received. - * Otherwise, we've reached the end. - */ - if (nmap == map_size - map_valid) - map_off = - map[map_valid + nmap - 1].br_startoff + - map[map_valid + nmap - 1].br_blockcount; - else - map_off = - XFS_DIR2_BYTE_TO_DA(mp, - XFS_DIR2_LEAF_OFFSET); - /* - * Look for holes in the mapping, and - * eliminate them. Count up the valid blocks. - */ - for (i = map_valid; i < map_valid + nmap; ) { - if (map[i].br_startblock == - HOLESTARTBLOCK) { - nmap--; - length = map_valid + nmap - i; - if (length) - memmove(&map[i], - &map[i + 1], - sizeof(map[i]) * - length); - } else { - map_blocks += - map[i].br_blockcount; - i++; - } - } - map_valid += nmap; - } - /* - * No valid mappings, so no more data blocks. - */ - if (!map_valid) { - curoff = XFS_DIR2_DA_TO_BYTE(mp, map_off); - break; - } - /* - * Read the directory block starting at the first - * mapping. - */ - curdb = XFS_DIR2_DA_TO_DB(mp, map->br_startoff); - error = xfs_da_read_buf(tp, dp, map->br_startoff, - map->br_blockcount >= mp->m_dirblkfsbs ? - XFS_FSB_TO_DADDR(mp, map->br_startblock) : - -1, - &bp, XFS_DATA_FORK); - /* - * Should just skip over the data block instead - * of giving up. - */ - if (error) - break; /* XXX */ - /* - * Adjust the current amount of read-ahead: we just - * read a block that was previously ra. - */ - if (ra_current) - ra_current -= mp->m_dirblkfsbs; - /* - * Do we need more readahead? - */ - for (ra_index = ra_offset = i = 0; - ra_want > ra_current && i < map_blocks; - i += mp->m_dirblkfsbs) { - ASSERT(ra_index < map_valid); - /* - * Read-ahead a contiguous directory block. - */ - if (i > ra_current && - map[ra_index].br_blockcount >= - mp->m_dirblkfsbs) { - xfs_baread(mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, - map[ra_index].br_startblock + - ra_offset), - (int)BTOBB(mp->m_dirblksize)); - ra_current = i; - } - /* - * Read-ahead a non-contiguous directory block. - * This doesn't use our mapping, but this - * is a very rare case. - */ - else if (i > ra_current) { - (void)xfs_da_reada_buf(tp, dp, - map[ra_index].br_startoff + - ra_offset, XFS_DATA_FORK); - ra_current = i; - } - /* - * Advance offset through the mapping table. - */ - for (j = 0; j < mp->m_dirblkfsbs; j++) { - /* - * The rest of this extent but not - * more than a dir block. - */ - length = MIN(mp->m_dirblkfsbs, - (int)(map[ra_index].br_blockcount - - ra_offset)); - j += length; - ra_offset += length; - /* - * Advance to the next mapping if - * this one is used up. - */ - if (ra_offset == - map[ra_index].br_blockcount) { - ra_offset = 0; - ra_index++; - } - } - } - /* - * Having done a read, we need to set a new offset. - */ - newoff = XFS_DIR2_DB_OFF_TO_BYTE(mp, curdb, 0); - /* - * Start of the current block. - */ - if (curoff < newoff) - curoff = newoff; - /* - * Make sure we're in the right block. - */ - else if (curoff > newoff) - ASSERT(XFS_DIR2_BYTE_TO_DB(mp, curoff) == - curdb); - data = bp->data; - xfs_dir2_data_check(dp, bp); - /* - * Find our position in the block. - */ - ptr = (char *)&data->u; - byteoff = XFS_DIR2_BYTE_TO_OFF(mp, curoff); - /* - * Skip past the header. - */ - if (byteoff == 0) - curoff += (uint)sizeof(data->hdr); - /* - * Skip past entries until we reach our offset. - */ - else { - while ((char *)ptr - (char *)data < byteoff) { - dup = (xfs_dir2_data_unused_t *)ptr; - - if (be16_to_cpu(dup->freetag) - == XFS_DIR2_DATA_FREE_TAG) { - - length = be16_to_cpu(dup->length); - ptr += length; - continue; - } - dep = (xfs_dir2_data_entry_t *)ptr; - length = - XFS_DIR2_DATA_ENTSIZE(dep->namelen); - ptr += length; - } - /* - * Now set our real offset. - */ - curoff = - XFS_DIR2_DB_OFF_TO_BYTE(mp, - XFS_DIR2_BYTE_TO_DB(mp, curoff), - (char *)ptr - (char *)data); - if (ptr >= (char *)data + mp->m_dirblksize) { - continue; - } - } - } - /* - * We have a pointer to an entry. - * Is it a live one? - */ - dup = (xfs_dir2_data_unused_t *)ptr; - /* - * No, it's unused, skip over it. - */ - if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - length = be16_to_cpu(dup->length); - ptr += length; - curoff += length; - continue; - } - - /* - * Copy the entry into the putargs, and try formatting it. - */ - dep = (xfs_dir2_data_entry_t *)ptr; - - p->namelen = dep->namelen; - - length = XFS_DIR2_DATA_ENTSIZE(p->namelen); - - p->cook = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff + length); - - p->ino = be64_to_cpu(dep->inumber); -#if XFS_BIG_INUMS - p->ino += mp->m_inoadd; -#endif - p->name = (char *)dep->name; - - error = p->put(p); - - /* - * Won't fit. Return to caller. - */ - if (!p->done) { - eof = 0; - break; - } - /* - * Advance to next entry in the block. - */ - ptr += length; - curoff += length; - } - - /* - * All done. Set output offset value to current offset. - */ - *eofp = eof; - if (curoff > XFS_DIR2_DATAPTR_TO_BYTE(mp, XFS_DIR2_MAX_DATAPTR)) - uio->uio_offset = XFS_DIR2_MAX_DATAPTR; - else - uio->uio_offset = XFS_DIR2_BYTE_TO_DATAPTR(mp, curoff); - kmem_free(map, map_size * sizeof(*map)); - kmem_free(p, sizeof(*p)); - if (bp) - xfs_da_brelse(tp, bp); - return error; -} - -/* - * Initialize a new leaf block, leaf1 or leafn magic accepted. - */ -int -xfs_dir2_leaf_init( - xfs_da_args_t *args, /* operation arguments */ - xfs_dir2_db_t bno, /* directory block number */ - xfs_dabuf_t **bpp, /* out: leaf buffer */ - int magic) /* magic number for block */ -{ - xfs_dabuf_t *bp; /* leaf buffer */ - xfs_inode_t *dp; /* incore directory inode */ - int error; /* error return code */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_trans_t *tp; /* transaction pointer */ - - dp = args->dp; - ASSERT(dp != NULL); - tp = args->trans; - mp = dp->i_mount; - ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) && - bno < XFS_DIR2_FREE_FIRSTDB(mp)); - /* - * Get the buffer for the block. - */ - error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, bno), -1, &bp, - XFS_DATA_FORK); - if (error) { - return error; - } - ASSERT(bp != NULL); - leaf = bp->data; - /* - * Initialize the header. - */ - leaf->hdr.info.magic = cpu_to_be16(magic); - leaf->hdr.info.forw = 0; - leaf->hdr.info.back = 0; - leaf->hdr.count = 0; - leaf->hdr.stale = 0; - xfs_dir2_leaf_log_header(tp, bp); - /* - * If it's a leaf-format directory initialize the tail. - * In this case our caller has the real bests table to copy into - * the block. - */ - if (magic == XFS_DIR2_LEAF1_MAGIC) { - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - ltp->bestcount = 0; - xfs_dir2_leaf_log_tail(tp, bp); - } - *bpp = bp; - return 0; -} - -/* * Log the bests entries indicated from a leaf1 block. */ static void -xfs_dir2_leaf_log_bests( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ +xfs_dir3_leaf_log_bests( + struct xfs_da_args *args, + struct xfs_buf *bp, /* leaf buffer */ int first, /* first entry to log */ int last) /* last entry to log */ { __be16 *firstb; /* pointer to first entry */ __be16 *lastb; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(tp->t_mountp, leaf); - firstb = XFS_DIR2_LEAF_BESTS_P(ltp) + first; - lastb = XFS_DIR2_LEAF_BESTS_P(ltp) + last; - xfs_da_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf), + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + firstb = xfs_dir2_leaf_bests_p(ltp) + first; + lastb = xfs_dir2_leaf_bests_p(ltp) + last; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstb - (char *)leaf), (uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1)); } @@ -1218,22 +1081,27 @@ xfs_dir2_leaf_log_bests( * Log the leaf entries indicated from a leaf1 or leafn block. */ void -xfs_dir2_leaf_log_ents( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* leaf buffer */ - int first, /* first entry to log */ - int last) /* last entry to log */ +xfs_dir3_leaf_log_ents( + struct xfs_da_args *args, + struct xfs_buf *bp, + int first, + int last) { xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */ xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || - be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - firstlep = &leaf->ents[first]; - lastlep = &leaf->ents[last]; - xfs_da_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf), + ents = args->dp->d_ops->leaf_ents_p(leaf); + firstlep = &ents[first]; + lastlep = &ents[last]; + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)firstlep - (char *)leaf), (uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1)); } @@ -1241,37 +1109,41 @@ xfs_dir2_leaf_log_ents( * Log the header of the leaf1 or leafn block. */ void -xfs_dir2_leaf_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ +xfs_dir3_leaf_log_header( + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC || - be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - xfs_da_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf), - (uint)(sizeof(leaf->hdr) - 1)); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&leaf->hdr - (char *)leaf), + args->dp->d_ops->leaf_hdr_size - 1); } /* * Log the tail of the leaf1 block. */ STATIC void -xfs_dir2_leaf_log_tail( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* leaf buffer */ +xfs_dir3_leaf_log_tail( + struct xfs_da_args *args, + struct xfs_buf *bp) { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ - xfs_mount_t *mp; /* filesystem mount point */ - mp = tp->t_mountp; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAF1_MAGIC); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - xfs_da_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf), - (uint)(mp->m_dirblksize - 1)); + ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) || + leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)); + + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf), + (uint)(args->geo->blksize - 1)); } /* @@ -1283,17 +1155,19 @@ int xfs_dir2_leaf_lookup( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* found entry index */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leaf_lookup(args); - xfs_dir2_trace_args("leaf_lookup", args); /* * Look up name in the leaf block, returning both buffers and index. */ @@ -1302,25 +1176,29 @@ xfs_dir2_leaf_lookup( } tp = args->trans; dp = args->dp; - xfs_dir2_leaf_check(dp, lbp); - leaf = lbp->data; + xfs_dir3_leaf_check(dp, lbp); + leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); /* * Get to the leaf entry and contained data entry address. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); + ((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); /* - * Return the found inode number. + * Return the found inode number & CI name if appropriate */ args->inumber = be64_to_cpu(dep->inumber); - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); - return XFS_ERROR(EEXIST); + args->filetype = dp->d_ops->data_get_ftype(dep); + error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); + return XFS_ERROR(error); } /* @@ -1332,37 +1210,41 @@ xfs_dir2_leaf_lookup( static int /* error */ xfs_dir2_leaf_lookup_int( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t **lbpp, /* out: leaf buffer */ + struct xfs_buf **lbpp, /* out: leaf buffer */ int *indexp, /* out: index in leaf block */ - xfs_dabuf_t **dbpp) /* out: data buffer */ + struct xfs_buf **dbpp) /* out: data buffer */ { - xfs_dir2_db_t curdb; /* current data block number */ - xfs_dabuf_t *dbp; /* data buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + struct xfs_buf *dbp = NULL; /* data buffer */ xfs_dir2_data_entry_t *dep; /* data entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index in leaf block */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_db_t newdb; /* new data block number */ xfs_trans_t *tp; /* transaction pointer */ + xfs_dir2_db_t cidb = -1; /* case match data block no. */ + enum xfs_dacmp cmp; /* name compare result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - /* - * Read the leaf block into the buffer. - */ - if ((error = - xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, - XFS_DATA_FORK))) { + + error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp); + if (error) return error; - } + *lbpp = lbp; - leaf = lbp->data; - xfs_dir2_leaf_check(dp, lbp); + leaf = lbp->b_addr; + xfs_dir3_leaf_check(dp, lbp); + ents = dp->d_ops->leaf_ents_p(leaf); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + /* * Look for the first leaf entry with our hash value. */ @@ -1371,8 +1253,8 @@ xfs_dir2_leaf_lookup_int( * Loop over all the entries with the right hash value * looking to match the name. */ - for (lep = &leaf->ents[index], dbp = NULL, curdb = -1; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* * Skip over stale leaf entries. @@ -1382,48 +1264,75 @@ xfs_dir2_leaf_lookup_int( /* * Get the new data block number. */ - newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * If it's not the same as the old data block number, * need to pitch the old one and read the new one. */ if (newdb != curdb) { if (dbp) - xfs_da_brelse(tp, dbp); - if ((error = - xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, newdb), -1, &dbp, - XFS_DATA_FORK))) { - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, newdb), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); return error; } - xfs_dir2_data_check(dp, dbp); curdb = newdb; } /* * Point to the data entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); /* - * If it matches then return it. + * Compare name and if it's an exact match, return the index + * and buffer. If it's the first case-insensitive match, store + * the index and buffer and continue looking for an exact match. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { - *dbpp = dbp; + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; *indexp = index; - return 0; + /* case exact match: return the current buffer. */ + if (cmp == XFS_CMP_EXACT) { + *dbpp = dbp; + return 0; + } + cidb = curdb; } } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + /* + * Here, we can only be doing a lookup (not a rename or remove). + * If a case-insensitive match was found earlier, re-read the + * appropriate data block if required and return it. + */ + if (args->cmpresult == XFS_CMP_CASE) { + ASSERT(cidb != -1); + if (cidb != curdb) { + xfs_trans_brelse(tp, dbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, cidb), + -1, &dbp); + if (error) { + xfs_trans_brelse(tp, lbp); + return error; + } + } + *dbpp = dbp; + return 0; + } /* * No match found, return ENOENT. */ - ASSERT(args->oknoent); + ASSERT(cidb == -1); if (dbp) - xfs_da_brelse(tp, dbp); - xfs_da_brelse(tp, lbp); + xfs_trans_brelse(tp, dbp); + xfs_trans_brelse(tp, lbp); return XFS_ERROR(ENOENT); } @@ -1435,15 +1344,15 @@ xfs_dir2_leaf_removename( xfs_da_args_t *args) /* operation arguments */ { __be16 *bestsp; /* leaf block best freespace */ - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry structure */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_db_t i; /* temporary data block # */ int index; /* index into leaf entries */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */ @@ -1452,8 +1361,12 @@ xfs_dir2_leaf_removename( int needscan; /* need to rescan data frees */ xfs_dir2_data_off_t oldbest; /* old value of best free */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + trace_xfs_dir2_leaf_removename(args); - xfs_dir2_trace_args("leaf_removename", args); /* * Lookup the leaf entry, get the leaf and data blocks read in. */ @@ -1463,57 +1376,63 @@ xfs_dir2_leaf_removename( dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = lbp->data; - data = dbp->data; - xfs_dir2_data_check(dp, dbp); + leaf = lbp->b_addr; + hdr = dbp->b_addr; + xfs_dir3_data_check(dp, dbp); + bf = dp->d_ops->data_bestfree_p(hdr); + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, use that to point to the data entry. */ - lep = &leaf->ents[index]; - db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); - dep = (xfs_dir2_data_entry_t *) - ((char *)data + XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + lep = &ents[index]; + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); needscan = needlog = 0; - oldbest = be16_to_cpu(data->hdr.bestfree[0].length); - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); + oldbest = be16_to_cpu(bf[0].length); + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + bestsp = xfs_dir2_leaf_bests_p(ltp); ASSERT(be16_to_cpu(bestsp[db]) == oldbest); /* * Mark the former data entry unused. */ - xfs_dir2_data_make_free(tp, dbp, - (xfs_dir2_data_aoff_t)((char *)dep - (char *)data), - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_make_free(args, dbp, + (xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr), + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * We just mark the leaf entry stale by putting a null in it. */ - be16_add(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, lbp); + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, lbp, index, index); + xfs_dir3_leaf_log_ents(args, lbp, index, index); + /* * Scan the freespace in the data block again if necessary, * log the data block header if necessary. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the longest freespace in the data block has changed, * put the new value in the bests table and log that. */ - if (be16_to_cpu(data->hdr.bestfree[0].length) != oldbest) { - bestsp[db] = data->hdr.bestfree[0].length; - xfs_dir2_leaf_log_bests(tp, lbp, db, db); + if (be16_to_cpu(bf[0].length) != oldbest) { + bestsp[db] = bf[0].length; + xfs_dir3_leaf_log_bests(args, lbp, db, db); } - xfs_dir2_data_check(dp, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the data block is now empty then get rid of the data block. */ - if (be16_to_cpu(data->hdr.bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(data->hdr)) { - ASSERT(db != mp->m_dirdatablk); + if (be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset) { + ASSERT(db != args->geo->datablk); if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { /* * Nope, can't get rid of it because it caused @@ -1521,12 +1440,9 @@ xfs_dir2_leaf_removename( * Just go on, returning success, leaving the * empty block in place. */ - if (error == ENOSPC && args->total == 0) { - xfs_da_buf_done(dbp); + if (error == ENOSPC && args->total == 0) error = 0; - } - xfs_dir2_leaf_check(dp, lbp); - xfs_da_buf_done(lbp); + xfs_dir3_leaf_check(dp, lbp); return error; } dbp = NULL; @@ -1539,7 +1455,7 @@ xfs_dir2_leaf_removename( * Look for the last active entry (i). */ for (i = db - 1; i > 0; i--) { - if (be16_to_cpu(bestsp[i]) != NULLDATAOFF) + if (bestsp[i] != cpu_to_be16(NULLDATAOFF)) break; } /* @@ -1548,20 +1464,20 @@ xfs_dir2_leaf_removename( */ memmove(&bestsp[db - i], bestsp, (be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp)); - be32_add(<p->bestcount, -(db - i)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + be32_add_cpu(<p->bestcount, -(db - i)); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, + be32_to_cpu(ltp->bestcount) - 1); } else bestsp[db] = cpu_to_be16(NULLDATAOFF); } /* * If the data block was not the first one, drop it. */ - else if (db != mp->m_dirdatablk && dbp != NULL) { - xfs_da_buf_done(dbp); + else if (db != args->geo->datablk) dbp = NULL; - } - xfs_dir2_leaf_check(dp, lbp); + + xfs_dir3_leaf_check(dp, lbp); /* * See if we can convert to block form. */ @@ -1575,17 +1491,19 @@ int /* error */ xfs_dir2_leaf_replace( xfs_da_args_t *args) /* operation arguments */ { - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ int index; /* index of leaf entry */ - xfs_dabuf_t *lbp; /* leaf buffer */ + struct xfs_buf *lbp; /* leaf buffer */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leaf_replace(args); - xfs_dir2_trace_args("leaf_replace", args); /* * Look up the entry. */ @@ -1593,27 +1511,28 @@ xfs_dir2_leaf_replace( return error; } dp = args->dp; - leaf = lbp->data; + leaf = lbp->b_addr; + ents = dp->d_ops->leaf_ents_p(leaf); /* * Point to the leaf entry, get data address from it. */ - lep = &leaf->ents[index]; + lep = &ents[index]; /* * Point to the data entry. */ dep = (xfs_dir2_data_entry_t *) - ((char *)dbp->data + - XFS_DIR2_DATAPTR_TO_OFF(dp->i_mount, be32_to_cpu(lep->address))); + ((char *)dbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address))); ASSERT(args->inumber != be64_to_cpu(dep->inumber)); /* * Put the new inode number in, log it. */ dep->inumber = cpu_to_be64(args->inumber); + dp->d_ops->data_put_ftype(dep, args->filetype); tp = args->trans; - xfs_dir2_data_log_entry(tp, dbp, dep); - xfs_da_buf_done(dbp); - xfs_dir2_leaf_check(dp, lbp); - xfs_da_brelse(tp, lbp); + xfs_dir2_data_log_entry(args, dbp, dep); + xfs_dir3_leaf_check(dp, lbp); + xfs_trans_brelse(tp, lbp); return 0; } @@ -1625,7 +1544,7 @@ xfs_dir2_leaf_replace( int /* index value */ xfs_dir2_leaf_search_hash( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_dahash_t hash=0; /* hash from this entry */ xfs_dahash_t hashwant; /* hash value looking for */ @@ -1634,17 +1553,18 @@ xfs_dir2_leaf_search_hash( xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ int mid=0; /* current leaf index */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + leaf = lbp->b_addr; + ents = args->dp->d_ops->leaf_ents_p(leaf); + args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); - leaf = lbp->data; -#ifndef __KERNEL__ - if (!leaf->hdr.count) - return 0; -#endif /* * Note, the table cannot be empty, so we have to go through the loop. * Binary search the leaf entries looking for our hash value. */ - for (lep = leaf->ents, low = 0, high = be16_to_cpu(leaf->hdr.count) - 1, + for (lep = ents, low = 0, high = leafhdr.count - 1, hashwant = args->hashval; low <= high; ) { mid = (low + high) >> 1; @@ -1678,14 +1598,11 @@ xfs_dir2_leaf_search_hash( int /* error */ xfs_dir2_leaf_trim_data( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp, /* leaf buffer */ + struct xfs_buf *lbp, /* leaf buffer */ xfs_dir2_db_t db) /* data block number */ { __be16 *bestsp; /* leaf bests table */ -#ifdef DEBUG - xfs_dir2_data_t *data; /* data block structure */ -#endif - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -1699,43 +1616,66 @@ xfs_dir2_leaf_trim_data( /* * Read the offending data block. We need its buffer. */ - if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, db), -1, &dbp, - XFS_DATA_FORK))) { + error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db), + -1, &dbp); + if (error) return error; - } + + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + #ifdef DEBUG - data = dbp->data; - ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); -#endif - /* this seems to be an error - * data is only valid if DEBUG is defined? - * RMC 09/08/1999 - */ +{ + struct xfs_dir2_data_hdr *hdr = dbp->b_addr; + struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr); - leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) == - mp->m_dirblksize - (uint)sizeof(data->hdr)); + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); + ASSERT(be16_to_cpu(bf[0].length) == + args->geo->blksize - dp->d_ops->data_entry_offset); ASSERT(db == be32_to_cpu(ltp->bestcount) - 1); +} +#endif + /* * Get rid of the data block. */ if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { ASSERT(error != ENOSPC); - xfs_da_brelse(tp, dbp); + xfs_trans_brelse(tp, dbp); return error; } /* * Eliminate the last bests entry from the table. */ - bestsp = XFS_DIR2_LEAF_BESTS_P(ltp); - be32_add(<p->bestcount, -1); + bestsp = xfs_dir2_leaf_bests_p(ltp); + be32_add_cpu(<p->bestcount, -1); memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp)); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); return 0; } +static inline size_t +xfs_dir3_leaf_size( + struct xfs_dir3_icleaf_hdr *hdr, + int counts) +{ + int entries; + int hdrsize; + + entries = hdr->count - hdr->stale; + if (hdr->magic == XFS_DIR2_LEAF1_MAGIC || + hdr->magic == XFS_DIR2_LEAFN_MAGIC) + hdrsize = sizeof(struct xfs_dir2_leaf_hdr); + else + hdrsize = sizeof(struct xfs_dir3_leaf_hdr); + + return hdrsize + entries * sizeof(xfs_dir2_leaf_entry_t) + + counts * sizeof(xfs_dir2_data_off_t) + + sizeof(xfs_dir2_leaf_tail_t); +} + /* * Convert node form directory to leaf form directory. * The root of the node form dir needs to already be a LEAFN block. @@ -1748,15 +1688,17 @@ xfs_dir2_node_to_leaf( xfs_da_args_t *args; /* operation arguments */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ - xfs_dabuf_t *fbp; /* buffer for freespace block */ + struct xfs_buf *fbp; /* buffer for freespace block */ xfs_fileoff_t fo; /* freespace file offset */ xfs_dir2_free_t *free; /* freespace structure */ - xfs_dabuf_t *lbp; /* buffer for leaf block */ + struct xfs_buf *lbp; /* buffer for leaf block */ xfs_dir2_leaf_tail_t *ltp; /* tail of leaf structure */ xfs_dir2_leaf_t *leaf; /* leaf structure */ xfs_mount_t *mp; /* filesystem mount point */ int rval; /* successful free trim? */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir3_icfree_hdr freehdr; /* * There's more than a leaf level in the btree, so there must @@ -1765,29 +1707,31 @@ xfs_dir2_node_to_leaf( if (state->path.active > 1) return 0; args = state->args; - xfs_dir2_trace_args("node_to_leaf", args); + + trace_xfs_dir2_node_to_leaf(args); + mp = state->mp; dp = args->dp; tp = args->trans; /* * Get the last offset in the file. */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) { + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) { return error; } - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; /* * If there are freespace blocks other than the first one, * take this opportunity to remove trailing empty freespace blocks * that may have been left behind during no-space-reservation * operations. */ - while (fo > mp->m_dirfreeblk) { + while (fo > args->geo->freeblk) { if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) { return error; } if (rval) - fo -= mp->m_dirblkfsbs; + fo -= args->geo->fsbcount; else return 0; } @@ -1800,59 +1744,71 @@ xfs_dir2_node_to_leaf( /* * If it's not the single leaf block, give up. */ - if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize) + if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize) return 0; lbp = state->path.blk[0].bp; - leaf = lbp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + leaf = lbp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); + /* * Read the freespace block. */ - if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp); + if (error) return error; - } - free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - ASSERT(!free->hdr.firstdb); + free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + + ASSERT(!freehdr.firstdb); + /* * Now see if the leafn and free data will fit in a leaf1. * If not, release the buffer and give up. */ - if ((uint)sizeof(leaf->hdr) + - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale)) * (uint)sizeof(leaf->ents[0]) + - be32_to_cpu(free->hdr.nvalid) * (uint)sizeof(leaf->bests[0]) + - (uint)sizeof(leaf->tail) > - mp->m_dirblksize) { - xfs_da_brelse(tp, fbp); + if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) { + xfs_trans_brelse(tp, fbp); return 0; } + /* * If the leaf has any stale entries in it, compress them out. - * The compact routine will log the header. */ - if (be16_to_cpu(leaf->hdr.stale)) - xfs_dir2_leaf_compact(args, lbp); - else - xfs_dir2_leaf_log_header(tp, lbp); - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); + if (leafhdr.stale) + xfs_dir3_leaf_compact(args, &leafhdr, lbp); + + lbp->b_ops = &xfs_dir3_leaf1_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAF1_BUF); + leafhdr.magic = (leafhdr.magic == XFS_DIR2_LEAFN_MAGIC) + ? XFS_DIR2_LEAF1_MAGIC + : XFS_DIR3_LEAF1_MAGIC; + /* * Set up the leaf tail from the freespace block. */ - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - ltp->bestcount = free->hdr.nvalid; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ltp->bestcount = cpu_to_be32(freehdr.nvalid); + /* * Set up the leaf bests table. */ - memcpy(XFS_DIR2_LEAF_BESTS_P(ltp), free->bests, - be32_to_cpu(ltp->bestcount) * sizeof(leaf->bests[0])); - xfs_dir2_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); - xfs_dir2_leaf_log_tail(tp, lbp); - xfs_dir2_leaf_check(dp, lbp); + memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free), + freehdr.nvalid * sizeof(xfs_dir2_data_off_t)); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1); + xfs_dir3_leaf_log_tail(args, lbp); + xfs_dir3_leaf_check(dp, lbp); + /* * Get rid of the freespace block. */ - error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp); + error = xfs_dir2_shrink_inode(args, + xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET), + fbp); if (error) { /* * This can't fail here because it can only happen when diff --git a/fs/xfs/xfs_dir2_leaf.h b/fs/xfs/xfs_dir2_leaf.h deleted file mode 100644 index f57ca116241..00000000000 --- a/fs/xfs/xfs_dir2_leaf.h +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_LEAF_H__ -#define __XFS_DIR2_LEAF_H__ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_inode; -struct xfs_mount; -struct xfs_trans; - -/* - * Offset of the leaf/node space. First block in this space - * is the btree root. - */ -#define XFS_DIR2_LEAF_SPACE 1 -#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_LEAF_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_LEAF_OFFSET) - -/* - * Offset in data space of a data entry. - */ -typedef __uint32_t xfs_dir2_dataptr_t; -#define XFS_DIR2_MAX_DATAPTR ((xfs_dir2_dataptr_t)0xffffffff) -#define XFS_DIR2_NULL_DATAPTR ((xfs_dir2_dataptr_t)0) - -/* - * Leaf block header. - */ -typedef struct xfs_dir2_leaf_hdr { - xfs_da_blkinfo_t info; /* header for da routines */ - __be16 count; /* count of entries */ - __be16 stale; /* count of stale entries */ -} xfs_dir2_leaf_hdr_t; - -/* - * Leaf block entry. - */ -typedef struct xfs_dir2_leaf_entry { - __be32 hashval; /* hash value of name */ - __be32 address; /* address of data entry */ -} xfs_dir2_leaf_entry_t; - -/* - * Leaf block tail. - */ -typedef struct xfs_dir2_leaf_tail { - __be32 bestcount; -} xfs_dir2_leaf_tail_t; - -/* - * Leaf block. - * bests and tail are at the end of the block for single-leaf only - * (magic = XFS_DIR2_LEAF1_MAGIC not XFS_DIR2_LEAFN_MAGIC). - */ -typedef struct xfs_dir2_leaf { - xfs_dir2_leaf_hdr_t hdr; /* leaf header */ - xfs_dir2_leaf_entry_t ents[1]; /* entries */ - /* ... */ - xfs_dir2_data_off_t bests[1]; /* best free counts */ - xfs_dir2_leaf_tail_t tail; /* leaf tail */ -} xfs_dir2_leaf_t; - -/* - * DB blocks here are logical directory block numbers, not filesystem blocks. - */ - -#define XFS_DIR2_MAX_LEAF_ENTS(mp) xfs_dir2_max_leaf_ents(mp) -static inline int xfs_dir2_max_leaf_ents(struct xfs_mount *mp) -{ - return (int)(((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_leaf_hdr_t)) / - (uint)sizeof(xfs_dir2_leaf_entry_t)); -} - -/* - * Get address of the bestcount field in the single-leaf block. - */ -#define XFS_DIR2_LEAF_TAIL_P(mp,lp) xfs_dir2_leaf_tail_p(mp, lp) -static inline xfs_dir2_leaf_tail_t * -xfs_dir2_leaf_tail_p(struct xfs_mount *mp, xfs_dir2_leaf_t *lp) -{ - return (xfs_dir2_leaf_tail_t *) - ((char *)(lp) + (mp)->m_dirblksize - - (uint)sizeof(xfs_dir2_leaf_tail_t)); -} - -/* - * Get address of the bests array in the single-leaf block. - */ -#define XFS_DIR2_LEAF_BESTS_P(ltp) xfs_dir2_leaf_bests_p(ltp) -static inline __be16 * -xfs_dir2_leaf_bests_p(xfs_dir2_leaf_tail_t *ltp) -{ - return (__be16 *)ltp - be32_to_cpu(ltp->bestcount); -} - -/* - * Convert dataptr to byte in file space - */ -#define XFS_DIR2_DATAPTR_TO_BYTE(mp,dp) xfs_dir2_dataptr_to_byte(mp, dp) -static inline xfs_dir2_off_t -xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return (xfs_dir2_off_t)(dp) << XFS_DIR2_DATA_ALIGN_LOG; -} - -/* - * Convert byte in file space to dataptr. It had better be aligned. - */ -#define XFS_DIR2_BYTE_TO_DATAPTR(mp,by) xfs_dir2_byte_to_dataptr(mp,by) -static inline xfs_dir2_dataptr_t -xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_dataptr_t)((by) >> XFS_DIR2_DATA_ALIGN_LOG); -} - -/* - * Convert byte in space to (DB) block - */ -#define XFS_DIR2_BYTE_TO_DB(mp,by) xfs_dir2_byte_to_db(mp, by) -static inline xfs_dir2_db_t -xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_db_t)((by) >> \ - ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)); -} - -/* - * Convert dataptr to a block number - */ -#define XFS_DIR2_DATAPTR_TO_DB(mp,dp) xfs_dir2_dataptr_to_db(mp, dp) -static inline xfs_dir2_db_t -xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); -} - -/* - * Convert byte in space to offset in a block - */ -#define XFS_DIR2_BYTE_TO_OFF(mp,by) xfs_dir2_byte_to_off(mp, by) -static inline xfs_dir2_data_aoff_t -xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return (xfs_dir2_data_aoff_t)((by) & \ - ((1 << ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) - 1)); -} - -/* - * Convert dataptr to a byte offset in a block - */ -#define XFS_DIR2_DATAPTR_TO_OFF(mp,dp) xfs_dir2_dataptr_to_off(mp, dp) -static inline xfs_dir2_data_aoff_t -xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp) -{ - return XFS_DIR2_BYTE_TO_OFF(mp, XFS_DIR2_DATAPTR_TO_BYTE(mp, dp)); -} - -/* - * Convert block and offset to byte in space - */ -#define XFS_DIR2_DB_OFF_TO_BYTE(mp,db,o) \ - xfs_dir2_db_off_to_byte(mp, db, o) -static inline xfs_dir2_off_t -xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return ((xfs_dir2_off_t)(db) << \ - ((mp)->m_sb.sb_blocklog + (mp)->m_sb.sb_dirblklog)) + (o); -} - -/* - * Convert block (DB) to block (dablk) - */ -#define XFS_DIR2_DB_TO_DA(mp,db) xfs_dir2_db_to_da(mp, db) -static inline xfs_dablk_t -xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (xfs_dablk_t)((db) << (mp)->m_sb.sb_dirblklog); -} - -/* - * Convert byte in space to (DA) block - */ -#define XFS_DIR2_BYTE_TO_DA(mp,by) xfs_dir2_byte_to_da(mp, by) -static inline xfs_dablk_t -xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by) -{ - return XFS_DIR2_DB_TO_DA(mp, XFS_DIR2_BYTE_TO_DB(mp, by)); -} - -/* - * Convert block and offset to dataptr - */ -#define XFS_DIR2_DB_OFF_TO_DATAPTR(mp,db,o) \ - xfs_dir2_db_off_to_dataptr(mp, db, o) -static inline xfs_dir2_dataptr_t -xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db, - xfs_dir2_data_aoff_t o) -{ - return XFS_DIR2_BYTE_TO_DATAPTR(mp, XFS_DIR2_DB_OFF_TO_BYTE(mp, db, o)); -} - -/* - * Convert block (dablk) to block (DB) - */ -#define XFS_DIR2_DA_TO_DB(mp,da) xfs_dir2_da_to_db(mp, da) -static inline xfs_dir2_db_t -xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da) -{ - return (xfs_dir2_db_t)((da) >> (mp)->m_sb.sb_dirblklog); -} - -/* - * Convert block (dablk) to byte offset in space - */ -#define XFS_DIR2_DA_TO_BYTE(mp,da) xfs_dir2_da_to_byte(mp, da) -static inline xfs_dir2_off_t -xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da) -{ - return XFS_DIR2_DB_OFF_TO_BYTE(mp, XFS_DIR2_DA_TO_DB(mp, da), 0); -} - -/* - * Function declarations. - */ -extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, - struct xfs_dabuf *dbp); -extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); -extern void xfs_dir2_leaf_compact(struct xfs_da_args *args, - struct xfs_dabuf *bp); -extern void xfs_dir2_leaf_compact_x1(struct xfs_dabuf *bp, int *indexp, - int *lowstalep, int *highstalep, - int *lowlogp, int *highlogp); -extern int xfs_dir2_leaf_getdents(struct xfs_trans *tp, struct xfs_inode *dp, - struct uio *uio, int *eofp, - struct xfs_dirent *dbp, xfs_dir2_put_t put); -extern int xfs_dir2_leaf_init(struct xfs_da_args *args, xfs_dir2_db_t bno, - struct xfs_dabuf **bpp, int magic); -extern void xfs_dir2_leaf_log_ents(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); -extern void xfs_dir2_leaf_log_header(struct xfs_trans *tp, - struct xfs_dabuf *bp); -extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); -extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); -extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); -extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, - struct xfs_dabuf *lbp, xfs_dir2_db_t db); -extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); - -#endif /* __XFS_DIR2_LEAF_H__ */ diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 9ca71719b68..da43d304fca 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,68 +18,245 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_bmap.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_node.h" -#include "xfs_dir2_trace.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_cksum.h" /* * Function declarations. */ -static void xfs_dir2_free_log_header(xfs_trans_t *tp, xfs_dabuf_t *bp); -static int xfs_dir2_leafn_add(xfs_dabuf_t *bp, xfs_da_args_t *args, int index); -#ifdef DEBUG -static void xfs_dir2_leafn_check(xfs_inode_t *dp, xfs_dabuf_t *bp); -#else -#define xfs_dir2_leafn_check(dp, bp) -#endif -static void xfs_dir2_leafn_moveents(xfs_da_args_t *args, xfs_dabuf_t *bp_s, - int start_s, xfs_dabuf_t *bp_d, int start_d, - int count); +static int xfs_dir2_leafn_add(struct xfs_buf *bp, xfs_da_args_t *args, + int index); static void xfs_dir2_leafn_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, xfs_da_state_blk_t *blk2); -static int xfs_dir2_leafn_remove(xfs_da_args_t *args, xfs_dabuf_t *bp, +static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, int index, xfs_da_state_blk_t *dblk, int *rval); static int xfs_dir2_node_addname_int(xfs_da_args_t *args, xfs_da_state_blk_t *fblk); /* + * Check internal consistency of a leafn block. + */ +#ifdef DEBUG +#define xfs_dir3_leaf_check(dp, bp) \ +do { \ + if (!xfs_dir3_leafn_check((dp), (bp))) \ + ASSERT(0); \ +} while (0); + +static bool +xfs_dir3_leafn_check( + struct xfs_inode *dp, + struct xfs_buf *bp) +{ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { + struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; + if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + return false; + } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) + return false; + + return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf); +} +#else +#define xfs_dir3_leaf_check(dp, bp) +#endif + +static bool +xfs_dir3_free_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dir2_free_hdr *hdr = bp->b_addr; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (hdr3->magic != cpu_to_be32(XFS_DIR3_FREE_MAGIC)) + return false; + if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_uuid)) + return false; + if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + return false; + } else { + if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) + return false; + } + + /* XXX: should bounds check the xfs_dir3_icfree_hdr here */ + + return true; +} + +static void +xfs_dir3_free_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dir3_free_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_dir3_free_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; + + if (!xfs_dir3_free_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF); +} + +const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .verify_read = xfs_dir3_free_read_verify, + .verify_write = xfs_dir3_free_write_verify, +}; + + +static int +__xfs_dir3_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + xfs_daddr_t mappedbno, + struct xfs_buf **bpp) +{ + int err; + + err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, + XFS_DATA_FORK, &xfs_dir3_free_buf_ops); + + /* try read returns without an error or *bpp if it lands in a hole */ + if (!err && tp && *bpp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF); + return err; +} + +int +xfs_dir2_free_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir3_free_read(tp, dp, fbno, -1, bpp); +} + +static int +xfs_dir2_free_try_read( + struct xfs_trans *tp, + struct xfs_inode *dp, + xfs_dablk_t fbno, + struct xfs_buf **bpp) +{ + return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp); +} + +static int +xfs_dir3_free_get_buf( + xfs_da_args_t *args, + xfs_dir2_db_t fbno, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp = args->trans; + struct xfs_inode *dp = args->dp; + struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *bp; + int error; + struct xfs_dir3_icfree_hdr hdr; + + error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno), + -1, &bp, XFS_DATA_FORK); + if (error) + return error; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_FREE_BUF); + bp->b_ops = &xfs_dir3_free_buf_ops; + + /* + * Initialize the new block to be empty, and remember + * its first slot as our empty slot. + */ + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; + + hdr.magic = XFS_DIR3_FREE_MAGIC; + + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.owner = cpu_to_be64(dp->i_ino); + uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; + dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr); + *bpp = bp; + return 0; +} + +/* * Log entries from a freespace block. */ -void +STATIC void xfs_dir2_free_log_bests( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp, /* freespace buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp, int first, /* first entry to log */ int last) /* last entry to log */ { xfs_dir2_free_t *free; /* freespace structure */ + __be16 *bests; - free = bp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - xfs_da_log_buf(tp, bp, - (uint)((char *)&free->bests[first] - (char *)free), - (uint)((char *)&free->bests[last] - (char *)free + - sizeof(free->bests[0]) - 1)); + free = bp->b_addr; + bests = args->dp->d_ops->free_bests_p(free); + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); + xfs_trans_log_buf(args->trans, bp, + (uint)((char *)&bests[first] - (char *)free), + (uint)((char *)&bests[last] - (char *)free + + sizeof(bests[0]) - 1)); } /* @@ -86,15 +264,18 @@ xfs_dir2_free_log_bests( */ static void xfs_dir2_free_log_header( - xfs_trans_t *tp, /* transaction pointer */ - xfs_dabuf_t *bp) /* freespace buffer */ + struct xfs_da_args *args, + struct xfs_buf *bp) { +#ifdef DEBUG xfs_dir2_free_t *free; /* freespace structure */ - free = bp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - xfs_da_log_buf(tp, bp, (uint)((char *)&free->hdr - (char *)free), - (uint)(sizeof(xfs_dir2_free_hdr_t) - 1)); + free = bp->b_addr; + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); +#endif + xfs_trans_log_buf(args->trans, bp, 0, + args->dp->d_ops->free_hdr_size - 1); } /* @@ -105,11 +286,11 @@ xfs_dir2_free_log_header( int /* error */ xfs_dir2_leaf_to_node( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *lbp) /* leaf buffer */ + struct xfs_buf *lbp) /* leaf buffer */ { xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ xfs_dir2_db_t fdb; /* freespace block number */ xfs_dir2_free_t *free; /* freespace structure */ __be16 *from; /* pointer to freespace entry */ @@ -121,8 +302,10 @@ xfs_dir2_leaf_to_node( xfs_dir2_data_off_t off; /* freespace entry value */ __be16 *to; /* pointer to freespace entry */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; + + trace_xfs_dir2_leaf_to_node(args); - xfs_dir2_trace_args_b("leaf_to_node", args, lbp); dp = args->dp; mp = dp->i_mount; tp = args->trans; @@ -132,45 +315,57 @@ xfs_dir2_leaf_to_node( if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) { return error; } - ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp)); + ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET)); /* * Get the buffer for the new freespace block. */ - if ((error = xfs_da_get_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), -1, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir3_free_get_buf(args, fdb, &fbp); + if (error) return error; - } - ASSERT(fbp != NULL); - free = fbp->data; - leaf = lbp->data; - ltp = XFS_DIR2_LEAF_TAIL_P(mp, leaf); - /* - * Initialize the freespace block header. - */ - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = 0; - ASSERT(be32_to_cpu(ltp->bestcount) <= (uint)dp->i_d.di_size / mp->m_dirblksize); - free->hdr.nvalid = ltp->bestcount; + + free = fbp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + leaf = lbp->b_addr; + ltp = xfs_dir2_leaf_tail_p(args->geo, leaf); + ASSERT(be32_to_cpu(ltp->bestcount) <= + (uint)dp->i_d.di_size / args->geo->blksize); + /* * Copy freespace entries from the leaf block to the new block. * Count active entries. */ - for (i = n = 0, from = XFS_DIR2_LEAF_BESTS_P(ltp), to = free->bests; - i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { + from = xfs_dir2_leaf_bests_p(ltp); + to = dp->d_ops->free_bests_p(free); + for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) { if ((off = be16_to_cpu(*from)) != NULLDATAOFF) n++; *to = cpu_to_be16(off); } - free->hdr.nused = cpu_to_be32(n); - leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + + /* + * Now initialize the freespace block header. + */ + freehdr.nused = n; + freehdr.nvalid = be32_to_cpu(ltp->bestcount); + + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1); + xfs_dir2_free_log_header(args, fbp); + /* - * Log everything. + * Converting the leaf to a leafnode is just a matter of changing the + * magic number and the ops. Do the change directly to the buffer as + * it's less work (and less code) than decoding the header to host + * format and back again. */ - xfs_dir2_leaf_log_header(tp, lbp); - xfs_dir2_free_log_header(tp, fbp); - xfs_dir2_free_log_bests(tp, fbp, 0, be32_to_cpu(free->hdr.nvalid) - 1); - xfs_da_buf_done(fbp); - xfs_dir2_leafn_check(dp, lbp); + if (leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)) + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); + else + leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); + lbp->b_ops = &xfs_dir3_leafn_buf_ops; + xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF); + xfs_dir3_leaf_log_header(args, lbp); + xfs_dir3_leaf_check(dp, lbp); return 0; } @@ -180,7 +375,7 @@ xfs_dir2_leaf_to_node( */ static int /* error */ xfs_dir2_leafn_add( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int index) /* insertion pt for new entry */ { @@ -194,12 +389,17 @@ xfs_dir2_leafn_add( int lowstale; /* previous stale entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leafn_add(args, index); - xfs_dir2_trace_args_sb("leafn_add", args, index, bp); dp = args->dp; mp = dp->i_mount; tp = args->trans; - leaf = bp->data; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); /* * Quick check just to make sure we are not going to index @@ -215,154 +415,69 @@ xfs_dir2_leafn_add( * a compact. */ - if (be16_to_cpu(leaf->hdr.count) == XFS_DIR2_MAX_LEAF_ENTS(mp)) { - if (!leaf->hdr.stale) + if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { + if (!leafhdr.stale) return XFS_ERROR(ENOSPC); - compact = be16_to_cpu(leaf->hdr.stale) > 1; + compact = leafhdr.stale > 1; } else compact = 0; - ASSERT(index == 0 || be32_to_cpu(leaf->ents[index - 1].hashval) <= args->hashval); - ASSERT(index == be16_to_cpu(leaf->hdr.count) || - be32_to_cpu(leaf->ents[index].hashval) >= args->hashval); + ASSERT(index == 0 || be32_to_cpu(ents[index - 1].hashval) <= args->hashval); + ASSERT(index == leafhdr.count || + be32_to_cpu(ents[index].hashval) >= args->hashval); - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; /* * Compact out all but one stale leaf entry. Leaves behind * the entry closest to index. */ - if (compact) { - xfs_dir2_leaf_compact_x1(bp, &index, &lowstale, &highstale, - &lfloglow, &lfloghigh); - } - /* - * Set impossible logging indices for this case. - */ - else if (leaf->hdr.stale) { - lfloglow = be16_to_cpu(leaf->hdr.count); - lfloghigh = -1; - } - /* - * No stale entries, just insert a space for the new entry. - */ - if (!leaf->hdr.stale) { - lep = &leaf->ents[index]; - if (index < be16_to_cpu(leaf->hdr.count)) - memmove(lep + 1, lep, - (be16_to_cpu(leaf->hdr.count) - index) * sizeof(*lep)); - lfloglow = index; - lfloghigh = be16_to_cpu(leaf->hdr.count); - be16_add(&leaf->hdr.count, 1); - } - /* - * There are stale entries. We'll use one for the new entry. - */ - else { - /* - * If we didn't do a compact then we need to figure out - * which stale entry will be used. - */ - if (compact == 0) { - /* - * Find first stale entry before our insertion point. - */ - for (lowstale = index - 1; - lowstale >= 0 && - be32_to_cpu(leaf->ents[lowstale].address) != - XFS_DIR2_NULL_DATAPTR; - lowstale--) - continue; - /* - * Find next stale entry after insertion point. - * Stop looking if the answer would be worse than - * lowstale already found. - */ - for (highstale = index; - highstale < be16_to_cpu(leaf->hdr.count) && - be32_to_cpu(leaf->ents[highstale].address) != - XFS_DIR2_NULL_DATAPTR && - (lowstale < 0 || - index - lowstale - 1 >= highstale - index); - highstale++) - continue; - } + if (compact) + xfs_dir3_leaf_compact_x1(&leafhdr, ents, &index, &lowstale, + &highstale, &lfloglow, &lfloghigh); + else if (leafhdr.stale) { /* - * Using the low stale entry. - * Shift entries up toward the stale slot. + * Set impossible logging indices for this case. */ - if (lowstale >= 0 && - (highstale == be16_to_cpu(leaf->hdr.count) || - index - lowstale - 1 < highstale - index)) { - ASSERT(be32_to_cpu(leaf->ents[lowstale].address) == - XFS_DIR2_NULL_DATAPTR); - ASSERT(index - lowstale - 1 >= 0); - if (index - lowstale - 1 > 0) - memmove(&leaf->ents[lowstale], - &leaf->ents[lowstale + 1], - (index - lowstale - 1) * sizeof(*lep)); - lep = &leaf->ents[index - 1]; - lfloglow = MIN(lowstale, lfloglow); - lfloghigh = MAX(index - 1, lfloghigh); - } - /* - * Using the high stale entry. - * Shift entries down toward the stale slot. - */ - else { - ASSERT(be32_to_cpu(leaf->ents[highstale].address) == - XFS_DIR2_NULL_DATAPTR); - ASSERT(highstale - index >= 0); - if (highstale - index > 0) - memmove(&leaf->ents[index + 1], - &leaf->ents[index], - (highstale - index) * sizeof(*lep)); - lep = &leaf->ents[index]; - lfloglow = MIN(index, lfloglow); - lfloghigh = MAX(highstale, lfloghigh); - } - be16_add(&leaf->hdr.stale, -1); + lfloglow = leafhdr.count; + lfloghigh = -1; } + /* * Insert the new entry, log everything. */ + lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale, + highstale, &lfloglow, &lfloghigh); + lep->hashval = cpu_to_be32(args->hashval); - lep->address = cpu_to_be32(XFS_DIR2_DB_OFF_TO_DATAPTR(mp, + lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo, args->blkno, args->index)); - xfs_dir2_leaf_log_header(tp, bp); - xfs_dir2_leaf_log_ents(tp, bp, lfloglow, lfloghigh); - xfs_dir2_leafn_check(dp, bp); + + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh); + xfs_dir3_leaf_check(dp, bp); return 0; } #ifdef DEBUG -/* - * Check internal consistency of a leafn block. - */ -void -xfs_dir2_leafn_check( - xfs_inode_t *dp, /* incore directory inode */ - xfs_dabuf_t *bp) /* leaf buffer */ +static void +xfs_dir2_free_hdr_check( + struct xfs_inode *dp, + struct xfs_buf *bp, + xfs_dir2_db_t db) { - int i; /* leaf index */ - xfs_dir2_leaf_t *leaf; /* leaf structure */ - xfs_mount_t *mp; /* filesystem mount point */ - int stale; /* count of stale leaves */ + struct xfs_dir3_icfree_hdr hdr; - leaf = bp->data; - mp = dp->i_mount; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(leaf->hdr.count) <= XFS_DIR2_MAX_LEAF_ENTS(mp)); - for (i = stale = 0; i < be16_to_cpu(leaf->hdr.count); i++) { - if (i + 1 < be16_to_cpu(leaf->hdr.count)) { - ASSERT(be32_to_cpu(leaf->ents[i].hashval) <= - be32_to_cpu(leaf->ents[i + 1].hashval)); - } - if (be32_to_cpu(leaf->ents[i].address) == XFS_DIR2_NULL_DATAPTR) - stale++; - } - ASSERT(be16_to_cpu(leaf->hdr.stale) == stale); + dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr); + + ASSERT((hdr.firstdb % + dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0); + ASSERT(hdr.firstdb <= db); + ASSERT(db < hdr.firstdb + hdr.nvalid); } +#else +#define xfs_dir2_free_hdr_check(dp, bp, db) #endif /* DEBUG */ /* @@ -371,58 +486,67 @@ xfs_dir2_leafn_check( */ xfs_dahash_t /* hash value */ xfs_dir2_leafn_lasthash( - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_inode *dp, + struct xfs_buf *bp, /* leaf buffer */ int *count) /* count of entries in leaf */ { - xfs_dir2_leaf_t *leaf; /* leaf structure */ + struct xfs_dir2_leaf *leaf = bp->b_addr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + + ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC || + leafhdr.magic == XFS_DIR3_LEAFN_MAGIC); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); if (count) - *count = be16_to_cpu(leaf->hdr.count); - if (!leaf->hdr.count) + *count = leafhdr.count; + if (!leafhdr.count) return 0; - return be32_to_cpu(leaf->ents[be16_to_cpu(leaf->hdr.count) - 1].hashval); + + ents = dp->d_ops->leaf_ents_p(leaf); + return be32_to_cpu(ents[leafhdr.count - 1].hashval); } /* - * Look up a leaf entry in a node-format leaf block. - * If this is an addname then the extrablk in state is a freespace block, - * otherwise it's a data block. + * Look up a leaf entry for space to add a name in a node-format leaf block. + * The extrablk in state is a freespace block. */ -int -xfs_dir2_leafn_lookup_int( - xfs_dabuf_t *bp, /* leaf buffer */ +STATIC int +xfs_dir2_leafn_lookup_for_addname( + struct xfs_buf *bp, /* leaf buffer */ xfs_da_args_t *args, /* operation arguments */ int *indexp, /* out: leaf entry index */ xfs_da_state_t *state) /* state to fill in */ { - xfs_dabuf_t *curbp; /* current data/free buffer */ - xfs_dir2_db_t curdb; /* current data block number */ - xfs_dir2_db_t curfdb; /* current free block number */ - xfs_dir2_data_entry_t *dep; /* data block entry */ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_db_t curfdb = -1; /* current free block number */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int fi; /* free entry index */ - xfs_dir2_free_t *free=NULL; /* free block structure */ + xfs_dir2_free_t *free = NULL; /* free block structure */ int index; /* leaf entry index */ xfs_dir2_leaf_t *leaf; /* leaf structure */ - int length=0; /* length of new data entry */ + int length; /* length of new data entry */ xfs_dir2_leaf_entry_t *lep; /* leaf entry */ xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_db_t newdb; /* new data block number */ xfs_dir2_db_t newfdb; /* new free block number */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); -#ifdef __KERNEL__ - ASSERT(be16_to_cpu(leaf->hdr.count) > 0); -#endif - xfs_dir2_leafn_check(dp, bp); + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + xfs_dir3_leaf_check(dp, bp); + ASSERT(leafhdr.count > 0); + /* * Look up the hash value in the leaf entries. */ @@ -430,32 +554,20 @@ xfs_dir2_leafn_lookup_int( /* * Do we have a buffer coming in? */ - if (state->extravalid) + if (state->extravalid) { + /* If so, it's a free block buffer, get the block number. */ curbp = state->extrablk.bp; - else - curbp = NULL; - /* - * For addname, it's a free block buffer, get the block number. - */ - if (args->addname) { - curfdb = curbp ? state->extrablk.blkno : -1; - curdb = -1; - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); - if ((free = (curbp ? curbp->data : NULL))) - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - } - /* - * For others, it's a data block buffer, get the block number. - */ - else { - curfdb = -1; - curdb = curbp ? state->extrablk.blkno : -1; + curfdb = state->extrablk.blkno; + free = curbp->b_addr; + ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) || + free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC)); } + length = dp->d_ops->data_entsize(args->namelen); /* * Loop over leaf entries with the right hash value. */ - for (lep = &leaf->ents[index]; - index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval; + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; lep++, index++) { /* * Skip stale leaf entries. @@ -465,215 +577,310 @@ xfs_dir2_leafn_lookup_int( /* * Pull the data block number from the entry. */ - newdb = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); /* * For addname, we're looking for a place to put the new entry. * We want to use a data block with an entry of equal * hash value to ours if there is one with room. + * + * If this block isn't the data block we already have + * in hand, take a look at it. */ - if (args->addname) { + if (newdb != curdb) { + __be16 *bests; + + curdb = newdb; /* - * If this block isn't the data block we already have - * in hand, take a look at it. + * Convert the data block to the free block + * holding its freespace information. */ - if (newdb != curdb) { - curdb = newdb; - /* - * Convert the data block to the free block - * holding its freespace information. - */ - newfdb = XFS_DIR2_DB_TO_FDB(mp, newdb); - /* - * If it's not the one we have in hand, - * read it in. - */ - if (newfdb != curfdb) { - /* - * If we had one before, drop it. - */ - if (curbp) - xfs_da_brelse(tp, curbp); - /* - * Read the free block. - */ - if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, - newfdb), - -1, &curbp, - XFS_DATA_FORK))) { - return error; - } - free = curbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == - XFS_DIR2_FREE_MAGIC); - ASSERT((be32_to_cpu(free->hdr.firstdb) % - XFS_DIR2_MAX_FREE_BESTS(mp)) == - 0); - ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb); - ASSERT(curdb < - be32_to_cpu(free->hdr.firstdb) + - be32_to_cpu(free->hdr.nvalid)); - } - /* - * Get the index for our entry. - */ - fi = XFS_DIR2_DB_TO_FDINDEX(mp, curdb); - /* - * If it has room, return it. - */ - if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) { - XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", - XFS_ERRLEVEL_LOW, mp); - if (curfdb != newfdb) - xfs_da_brelse(tp, curbp); - return XFS_ERROR(EFSCORRUPTED); - } - curfdb = newfdb; - if (be16_to_cpu(free->bests[fi]) >= length) { - *indexp = index; - state->extravalid = 1; - state->extrablk.bp = curbp; - state->extrablk.blkno = curfdb; - state->extrablk.index = fi; - state->extrablk.magic = - XFS_DIR2_FREE_MAGIC; - ASSERT(args->oknoent); - return XFS_ERROR(ENOENT); - } - } - } - /* - * Not adding a new entry, so we really want to find - * the name given to us. - */ - else { + newfdb = dp->d_ops->db_to_fdb(args->geo, newdb); /* - * If it's a different data block, go get it. + * If it's not the one we have in hand, read it in. */ - if (newdb != curdb) { + if (newfdb != curfdb) { /* - * If we had a block before, drop it. + * If we had one before, drop it. */ if (curbp) - xfs_da_brelse(tp, curbp); - /* - * Read the data block. - */ - if ((error = - xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, newdb), -1, - &curbp, XFS_DATA_FORK))) { + xfs_trans_brelse(tp, curbp); + + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, + newfdb), + &curbp); + if (error) return error; - } - xfs_dir2_data_check(dp, curbp); - curdb = newdb; + free = curbp->b_addr; + + xfs_dir2_free_hdr_check(dp, curbp, curdb); } /* - * Point to the data entry. + * Get the index for our entry. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)curbp->data + - XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address))); + fi = dp->d_ops->db_to_fdindex(args->geo, curdb); /* - * Compare the entry, return it if it matches. + * If it has room, return it. */ - if (dep->namelen == args->namelen && - dep->name[0] == args->name[0] && - memcmp(dep->name, args->name, args->namelen) == 0) { - args->inumber = be64_to_cpu(dep->inumber); - *indexp = index; - state->extravalid = 1; - state->extrablk.bp = curbp; - state->extrablk.blkno = curdb; - state->extrablk.index = - (int)((char *)dep - - (char *)curbp->data); - state->extrablk.magic = XFS_DIR2_DATA_MAGIC; - return XFS_ERROR(EEXIST); + bests = dp->d_ops->free_bests_p(free); + if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) { + XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int", + XFS_ERRLEVEL_LOW, mp); + if (curfdb != newfdb) + xfs_trans_brelse(tp, curbp); + return XFS_ERROR(EFSCORRUPTED); } + curfdb = newfdb; + if (be16_to_cpu(bests[fi]) >= length) + goto out; } } + /* Didn't find any space */ + fi = -1; +out: + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); + if (curbp) { + /* Giving back a free block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = fi; + state->extrablk.blkno = curfdb; + + /* + * Important: this magic number is not in the buffer - it's for + * buffer type information and therefore only the free/data type + * matters here, not whether CRCs are enabled or not. + */ + state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + } else { + state->extravalid = 0; + } /* - * Didn't find a match. - * If we are holding a buffer, give it back in case our caller - * finds it useful. + * Return the index, that will be the insertion point. */ - if ((state->extravalid = (curbp != NULL))) { - state->extrablk.bp = curbp; - state->extrablk.index = -1; + *indexp = index; + return XFS_ERROR(ENOENT); +} + +/* + * Look up a leaf entry in a node-format leaf block. + * The extrablk in state a data block. + */ +STATIC int +xfs_dir2_leafn_lookup_for_entry( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + struct xfs_buf *curbp = NULL; /* current data/free buffer */ + xfs_dir2_db_t curdb = -1; /* current data block number */ + xfs_dir2_data_entry_t *dep; /* data block entry */ + xfs_inode_t *dp; /* incore directory inode */ + int error; /* error return value */ + int index; /* leaf entry index */ + xfs_dir2_leaf_t *leaf; /* leaf structure */ + xfs_dir2_leaf_entry_t *lep; /* leaf entry */ + xfs_mount_t *mp; /* filesystem mount point */ + xfs_dir2_db_t newdb; /* new data block number */ + xfs_trans_t *tp; /* transaction pointer */ + enum xfs_dacmp cmp; /* comparison result */ + struct xfs_dir2_leaf_entry *ents; + struct xfs_dir3_icleaf_hdr leafhdr; + + dp = args->dp; + tp = args->trans; + mp = dp->i_mount; + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + + xfs_dir3_leaf_check(dp, bp); + ASSERT(leafhdr.count > 0); + + /* + * Look up the hash value in the leaf entries. + */ + index = xfs_dir2_leaf_search_hash(args, bp); + /* + * Do we have a buffer coming in? + */ + if (state->extravalid) { + curbp = state->extrablk.bp; + curdb = state->extrablk.blkno; + } + /* + * Loop over leaf entries with the right hash value. + */ + for (lep = &ents[index]; + index < leafhdr.count && be32_to_cpu(lep->hashval) == args->hashval; + lep++, index++) { + /* + * Skip stale leaf entries. + */ + if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR) + continue; /* - * For addname, giving back a free block. + * Pull the data block number from the entry. */ - if (args->addname) { - state->extrablk.blkno = curfdb; - state->extrablk.magic = XFS_DIR2_FREE_MAGIC; + newdb = xfs_dir2_dataptr_to_db(args->geo, + be32_to_cpu(lep->address)); + /* + * Not adding a new entry, so we really want to find + * the name given to us. + * + * If it's a different data block, go get it. + */ + if (newdb != curdb) { + /* + * If we had a block before that we aren't saving + * for a CI name, drop it + */ + if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT || + curdb != state->extrablk.blkno)) + xfs_trans_brelse(tp, curbp); + /* + * If needing the block that is saved with a CI match, + * use it otherwise read in the new data block. + */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + newdb == state->extrablk.blkno) { + ASSERT(state->extravalid); + curbp = state->extrablk.bp; + } else { + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, + newdb), + -1, &curbp); + if (error) + return error; + } + xfs_dir3_data_check(dp, curbp); + curdb = newdb; } /* - * For other callers, giving back a data block. + * Point to the data entry. */ - else { + dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); + /* + * Compare the entry and if it's an exact match, return + * EEXIST immediately. If it's the first case-insensitive + * match, store the block & inode number and continue looking. + */ + cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + /* If there is a CI match block, drop it */ + if (args->cmpresult != XFS_CMP_DIFFERENT && + curdb != state->extrablk.blkno) + xfs_trans_brelse(tp, state->extrablk.bp); + args->cmpresult = cmp; + args->inumber = be64_to_cpu(dep->inumber); + args->filetype = dp->d_ops->data_get_ftype(dep); + *indexp = index; + state->extravalid = 1; + state->extrablk.bp = curbp; state->extrablk.blkno = curdb; + state->extrablk.index = (int)((char *)dep - + (char *)curbp->b_addr); state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); } } - /* - * Return the final index, that will be the insertion point. - */ + ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); + if (curbp) { + if (args->cmpresult == XFS_CMP_DIFFERENT) { + /* Giving back last used data block. */ + state->extravalid = 1; + state->extrablk.bp = curbp; + state->extrablk.index = -1; + state->extrablk.blkno = curdb; + state->extrablk.magic = XFS_DIR2_DATA_MAGIC; + curbp->b_ops = &xfs_dir3_data_buf_ops; + xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); + } else { + /* If the curbp is not the CI match block, drop it */ + if (state->extrablk.bp != curbp) + xfs_trans_brelse(tp, curbp); + } + } else { + state->extravalid = 0; + } *indexp = index; - ASSERT(index == be16_to_cpu(leaf->hdr.count) || args->oknoent); return XFS_ERROR(ENOENT); } /* + * Look up a leaf entry in a node-format leaf block. + * If this is an addname then the extrablk in state is a freespace block, + * otherwise it's a data block. + */ +int +xfs_dir2_leafn_lookup_int( + struct xfs_buf *bp, /* leaf buffer */ + xfs_da_args_t *args, /* operation arguments */ + int *indexp, /* out: leaf entry index */ + xfs_da_state_t *state) /* state to fill in */ +{ + if (args->op_flags & XFS_DA_OP_ADDNAME) + return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp, + state); + return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state); +} + +/* * Move count leaf entries from source to destination leaf. * Log entries and headers. Stale entries are preserved. */ static void -xfs_dir2_leafn_moveents( - xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp_s, /* source leaf buffer */ - int start_s, /* source leaf index */ - xfs_dabuf_t *bp_d, /* destination leaf buffer */ - int start_d, /* destination leaf index */ - int count) /* count of leaves to copy */ +xfs_dir3_leafn_moveents( + xfs_da_args_t *args, /* operation arguments */ + struct xfs_buf *bp_s, /* source */ + struct xfs_dir3_icleaf_hdr *shdr, + struct xfs_dir2_leaf_entry *sents, + int start_s,/* source leaf index */ + struct xfs_buf *bp_d, /* destination */ + struct xfs_dir3_icleaf_hdr *dhdr, + struct xfs_dir2_leaf_entry *dents, + int start_d,/* destination leaf index */ + int count) /* count of leaves to copy */ { - xfs_dir2_leaf_t *leaf_d; /* destination leaf structure */ - xfs_dir2_leaf_t *leaf_s; /* source leaf structure */ - int stale; /* count stale leaves copied */ - xfs_trans_t *tp; /* transaction pointer */ + int stale; /* count stale leaves copied */ + + trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count); - xfs_dir2_trace_args_bibii("leafn_moveents", args, bp_s, start_s, bp_d, - start_d, count); /* * Silently return if nothing to do. */ - if (count == 0) { + if (count == 0) return; - } - tp = args->trans; - leaf_s = bp_s->data; - leaf_d = bp_d->data; + /* * If the destination index is not the end of the current * destination leaf entries, open up a hole in the destination * to hold the new entries. */ - if (start_d < be16_to_cpu(leaf_d->hdr.count)) { - memmove(&leaf_d->ents[start_d + count], &leaf_d->ents[start_d], - (be16_to_cpu(leaf_d->hdr.count) - start_d) * - sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d + count, - count + be16_to_cpu(leaf_d->hdr.count) - 1); + if (start_d < dhdr->count) { + memmove(&dents[start_d + count], &dents[start_d], + (dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t)); + xfs_dir3_leaf_log_ents(args, bp_d, start_d + count, + count + dhdr->count - 1); } /* * If the source has stale leaves, count the ones in the copy range * so we can update the header correctly. */ - if (leaf_s->hdr.stale) { + if (shdr->stale) { int i; /* temp leaf index */ for (i = start_s, stale = 0; i < start_s + count; i++) { - if (be32_to_cpu(leaf_s->ents[i].address) == XFS_DIR2_NULL_DATAPTR) + if (sents[i].address == + cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; } } else @@ -681,29 +888,27 @@ xfs_dir2_leafn_moveents( /* * Copy the leaf entries from source to destination. */ - memcpy(&leaf_d->ents[start_d], &leaf_s->ents[start_s], + memcpy(&dents[start_d], &sents[start_s], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1); + xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1); + /* * If there are source entries after the ones we copied, * delete the ones we copied by sliding the next ones down. */ - if (start_s + count < be16_to_cpu(leaf_s->hdr.count)) { - memmove(&leaf_s->ents[start_s], &leaf_s->ents[start_s + count], + if (start_s + count < shdr->count) { + memmove(&sents[start_s], &sents[start_s + count], count * sizeof(xfs_dir2_leaf_entry_t)); - xfs_dir2_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1); + xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1); } + /* * Update the headers and log them. */ - be16_add(&leaf_s->hdr.count, -(count)); - be16_add(&leaf_s->hdr.stale, -(stale)); - be16_add(&leaf_d->hdr.count, count); - be16_add(&leaf_d->hdr.stale, stale); - xfs_dir2_leaf_log_header(tp, bp_s); - xfs_dir2_leaf_log_header(tp, bp_d); - xfs_dir2_leafn_check(args->dp, bp_s); - xfs_dir2_leafn_check(args->dp, bp_d); + shdr->count -= count; + shdr->stale -= stale; + dhdr->count += count; + dhdr->stale += stale; } /* @@ -712,21 +917,26 @@ xfs_dir2_leafn_moveents( */ int /* sort order */ xfs_dir2_leafn_order( - xfs_dabuf_t *leaf1_bp, /* leaf1 buffer */ - xfs_dabuf_t *leaf2_bp) /* leaf2 buffer */ + struct xfs_inode *dp, + struct xfs_buf *leaf1_bp, /* leaf1 buffer */ + struct xfs_buf *leaf2_bp) /* leaf2 buffer */ { - xfs_dir2_leaf_t *leaf1; /* leaf1 structure */ - xfs_dir2_leaf_t *leaf2; /* leaf2 structure */ - - leaf1 = leaf1_bp->data; - leaf2 = leaf2_bp->data; - ASSERT(be16_to_cpu(leaf1->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(leaf2->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - if (be16_to_cpu(leaf1->hdr.count) > 0 && - be16_to_cpu(leaf2->hdr.count) > 0 && - (be32_to_cpu(leaf2->ents[0].hashval) < be32_to_cpu(leaf1->ents[0].hashval) || - be32_to_cpu(leaf2->ents[be16_to_cpu(leaf2->hdr.count) - 1].hashval) < - be32_to_cpu(leaf1->ents[be16_to_cpu(leaf1->hdr.count) - 1].hashval))) + struct xfs_dir2_leaf *leaf1 = leaf1_bp->b_addr; + struct xfs_dir2_leaf *leaf2 = leaf2_bp->b_addr; + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + if (hdr1.count > 0 && hdr2.count > 0 && + (be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) || + be32_to_cpu(ents2[hdr2.count - 1].hashval) < + be32_to_cpu(ents1[hdr1.count - 1].hashval))) return 1; return 0; } @@ -750,30 +960,41 @@ xfs_dir2_leafn_rebalance( xfs_dir2_leaf_t *leaf1; /* first leaf structure */ xfs_dir2_leaf_t *leaf2; /* second leaf structure */ int mid; /* midpoint leaf index */ -#ifdef DEBUG +#if defined(DEBUG) || defined(XFS_WARN) int oldstale; /* old count of stale leaves */ #endif int oldsum; /* old total leaf count */ int swap; /* swapped leaf blocks */ + struct xfs_dir2_leaf_entry *ents1; + struct xfs_dir2_leaf_entry *ents2; + struct xfs_dir3_icleaf_hdr hdr1; + struct xfs_dir3_icleaf_hdr hdr2; + struct xfs_inode *dp = state->args->dp; args = state->args; /* * If the block order is wrong, swap the arguments. */ - if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) { + if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) { xfs_da_state_blk_t *tmp; /* temp for block swap */ tmp = blk1; blk1 = blk2; blk2 = tmp; } - leaf1 = blk1->bp->data; - leaf2 = blk2->bp->data; - oldsum = be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count); -#ifdef DEBUG - oldstale = be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale); + leaf1 = blk1->bp->b_addr; + leaf2 = blk2->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1); + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2); + ents1 = dp->d_ops->leaf_ents_p(leaf1); + ents2 = dp->d_ops->leaf_ents_p(leaf2); + + oldsum = hdr1.count + hdr2.count; +#if defined(DEBUG) || defined(XFS_WARN) + oldstale = hdr1.stale + hdr2.stale; #endif mid = oldsum >> 1; + /* * If the old leaf count was odd then the new one will be even, * so we need to divide the new count evenly. @@ -781,10 +1002,10 @@ xfs_dir2_leafn_rebalance( if (oldsum & 1) { xfs_dahash_t midhash; /* middle entry hash value */ - if (mid >= be16_to_cpu(leaf1->hdr.count)) - midhash = be32_to_cpu(leaf2->ents[mid - be16_to_cpu(leaf1->hdr.count)].hashval); + if (mid >= hdr1.count) + midhash = be32_to_cpu(ents2[mid - hdr1.count].hashval); else - midhash = be32_to_cpu(leaf1->ents[mid].hashval); + midhash = be32_to_cpu(ents1[mid].hashval); isleft = args->hashval <= midhash; } /* @@ -798,44 +1019,135 @@ xfs_dir2_leafn_rebalance( * Calculate moved entry count. Positive means left-to-right, * negative means right-to-left. Then move the entries. */ - count = be16_to_cpu(leaf1->hdr.count) - mid + (isleft == 0); + count = hdr1.count - mid + (isleft == 0); if (count > 0) - xfs_dir2_leafn_moveents(args, blk1->bp, - be16_to_cpu(leaf1->hdr.count) - count, blk2->bp, 0, count); + xfs_dir3_leafn_moveents(args, blk1->bp, &hdr1, ents1, + hdr1.count - count, blk2->bp, + &hdr2, ents2, 0, count); else if (count < 0) - xfs_dir2_leafn_moveents(args, blk2->bp, 0, blk1->bp, - be16_to_cpu(leaf1->hdr.count), count); - ASSERT(be16_to_cpu(leaf1->hdr.count) + be16_to_cpu(leaf2->hdr.count) == oldsum); - ASSERT(be16_to_cpu(leaf1->hdr.stale) + be16_to_cpu(leaf2->hdr.stale) == oldstale); + xfs_dir3_leafn_moveents(args, blk2->bp, &hdr2, ents2, 0, + blk1->bp, &hdr1, ents1, + hdr1.count, count); + + ASSERT(hdr1.count + hdr2.count == oldsum); + ASSERT(hdr1.stale + hdr2.stale == oldstale); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1); + dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2); + xfs_dir3_leaf_log_header(args, blk1->bp); + xfs_dir3_leaf_log_header(args, blk2->bp); + + xfs_dir3_leaf_check(dp, blk1->bp); + xfs_dir3_leaf_check(dp, blk2->bp); + /* * Mark whether we're inserting into the old or new leaf. */ - if (be16_to_cpu(leaf1->hdr.count) < be16_to_cpu(leaf2->hdr.count)) + if (hdr1.count < hdr2.count) state->inleaf = swap; - else if (be16_to_cpu(leaf1->hdr.count) > be16_to_cpu(leaf2->hdr.count)) + else if (hdr1.count > hdr2.count) state->inleaf = !swap; else - state->inleaf = - swap ^ (blk1->index <= be16_to_cpu(leaf1->hdr.count)); + state->inleaf = swap ^ (blk1->index <= hdr1.count); /* * Adjust the expected index for insertion. */ if (!state->inleaf) - blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count); - - /* - * Finally sanity check just to make sure we are not returning a negative index + blk2->index = blk1->index - hdr1.count; + + /* + * Finally sanity check just to make sure we are not returning a + * negative index */ - if(blk2->index < 0) { + if (blk2->index < 0) { state->inleaf = 1; blk2->index = 0; - cmn_err(CE_ALERT, - "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " - "blk1->index %d\n", - blk1->index); + xfs_alert(dp->i_mount, + "%s: picked the wrong leaf? reverting original leaf: blk1->index %d", + __func__, blk1->index); } } +static int +xfs_dir3_data_block_free( + xfs_da_args_t *args, + struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_free *free, + xfs_dir2_db_t fdb, + int findex, + struct xfs_buf *fbp, + int longest) +{ + int logfree = 0; + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_inode *dp = args->dp; + + dp->d_ops->free_hdr_from_disk(&freehdr, free); + bests = dp->d_ops->free_bests_p(free); + if (hdr) { + /* + * Data block is not empty, just set the free entry to the new + * value. + */ + bests[findex] = cpu_to_be16(longest); + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; + } + + /* One less used entry in the free table. */ + freehdr.nused--; + + /* + * If this was the last entry in the table, we can trim the table size + * back. There might be other entries at the end referring to + * non-existent data blocks, get those too. + */ + if (findex == freehdr.nvalid - 1) { + int i; /* free entry index */ + + for (i = findex - 1; i >= 0; i--) { + if (bests[i] != cpu_to_be16(NULLDATAOFF)) + break; + } + freehdr.nvalid = i + 1; + logfree = 0; + } else { + /* Not the last entry, just punch it out. */ + bests[findex] = cpu_to_be16(NULLDATAOFF); + logfree = 1; + } + + dp->d_ops->free_hdr_to_disk(free, &freehdr); + xfs_dir2_free_log_header(args, fbp); + + /* + * If there are no useful entries left in the block, get rid of the + * block if we can. + */ + if (!freehdr.nused) { + int error; + + error = xfs_dir2_shrink_inode(args, fdb, fbp); + if (error == 0) { + fbp = NULL; + logfree = 0; + } else if (error != ENOSPC || args->total != 0) + return error; + /* + * It's possible to get ENOSPC if there is no + * space reservation. In this case some one + * else will eventually get rid of this block. + */ + } + + /* Log the free entry that changed, unless we got rid of it. */ + if (logfree) + xfs_dir2_free_log_bests(args, fbp, findex, findex); + return 0; +} + /* * Remove an entry from a node directory. * This removes the leaf entry and the data entry, @@ -844,14 +1156,14 @@ xfs_dir2_leafn_rebalance( static int /* error */ xfs_dir2_leafn_remove( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* leaf buffer */ + struct xfs_buf *bp, /* leaf buffer */ int index, /* leaf entry index */ xfs_da_state_blk_t *dblk, /* data block */ int *rval) /* resulting block needs join */ { - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t db; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data block entry */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_leaf_t *leaf; /* leaf structure */ @@ -862,186 +1174,140 @@ xfs_dir2_leafn_remove( int needlog; /* need to log data header */ int needscan; /* need to rescan data frees */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir2_data_free *bf; /* bestfree table */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + + trace_xfs_dir2_leafn_remove(args, index); - xfs_dir2_trace_args_sb("leafn_remove", args, index, bp); dp = args->dp; tp = args->trans; mp = dp->i_mount; - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + /* * Point to the entry we're removing. */ - lep = &leaf->ents[index]; + lep = &ents[index]; + /* * Extract the data block and offset from the entry. */ - db = XFS_DIR2_DATAPTR_TO_DB(mp, be32_to_cpu(lep->address)); + db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->blkno == db); - off = XFS_DIR2_DATAPTR_TO_OFF(mp, be32_to_cpu(lep->address)); + off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)); ASSERT(dblk->index == off); + /* * Kill the leaf entry by marking it stale. * Log the leaf block changes. */ - be16_add(&leaf->hdr.stale, 1); - xfs_dir2_leaf_log_header(tp, bp); + leafhdr.stale++; + dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr); + xfs_dir3_leaf_log_header(args, bp); + lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR); - xfs_dir2_leaf_log_ents(tp, bp, index, index); + xfs_dir3_leaf_log_ents(args, bp, index, index); + /* * Make the data entry free. Keep track of the longest freespace * in the data block in case it changes. */ dbp = dblk->bp; - data = dbp->data; - dep = (xfs_dir2_data_entry_t *)((char *)data + off); - longest = be16_to_cpu(data->hdr.bestfree[0].length); + hdr = dbp->b_addr; + dep = (xfs_dir2_data_entry_t *)((char *)hdr + off); + bf = dp->d_ops->data_bestfree_p(hdr); + longest = be16_to_cpu(bf[0].length); needlog = needscan = 0; - xfs_dir2_data_make_free(tp, dbp, off, - XFS_DIR2_DATA_ENTSIZE(dep->namelen), &needlog, &needscan); + xfs_dir2_data_make_free(args, dbp, off, + dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan); /* * Rescan the data block freespaces for bestfree. * Log the data block header if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); if (needlog) - xfs_dir2_data_log_header(tp, dbp); - xfs_dir2_data_check(dp, dbp); + xfs_dir2_data_log_header(args, dbp); + xfs_dir3_data_check(dp, dbp); /* * If the longest data block freespace changes, need to update * the corresponding freeblock entry. */ - if (longest < be16_to_cpu(data->hdr.bestfree[0].length)) { + if (longest < be16_to_cpu(bf[0].length)) { int error; /* error return value */ - xfs_dabuf_t *fbp; /* freeblock buffer */ + struct xfs_buf *fbp; /* freeblock buffer */ xfs_dir2_db_t fdb; /* freeblock block number */ int findex; /* index in freeblock entries */ xfs_dir2_free_t *free; /* freeblock structure */ - int logfree; /* need to log free entry */ /* * Convert the data block number to a free block, * read in the free block. */ - fdb = XFS_DIR2_DB_TO_FDB(mp, db); - if ((error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, fdb), - -1, &fbp, XFS_DATA_FORK))) { + fdb = dp->d_ops->db_to_fdb(args->geo, db); + error = xfs_dir2_free_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fdb), + &fbp); + if (error) return error; - } - free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); - ASSERT(be32_to_cpu(free->hdr.firstdb) == - XFS_DIR2_MAX_FREE_BESTS(mp) * - (fdb - XFS_DIR2_FREE_FIRSTDB(mp))); + free = fbp->b_addr; +#ifdef DEBUG + { + struct xfs_dir3_icfree_hdr freehdr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) * + (fdb - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET))); + } +#endif /* * Calculate which entry we need to fix. */ - findex = XFS_DIR2_DB_TO_FDINDEX(mp, db); - longest = be16_to_cpu(data->hdr.bestfree[0].length); + findex = dp->d_ops->db_to_fdindex(args->geo, db); + longest = be16_to_cpu(bf[0].length); /* * If the data block is now empty we can get rid of it * (usually). */ - if (longest == mp->m_dirblksize - (uint)sizeof(data->hdr)) { + if (longest == args->geo->blksize - + dp->d_ops->data_entry_offset) { /* * Try to punch out the data block. */ error = xfs_dir2_shrink_inode(args, db, dbp); if (error == 0) { dblk->bp = NULL; - data = NULL; + hdr = NULL; } /* * We can get ENOSPC if there's no space reservation. * In this case just drop the buffer and some one else * will eventually get rid of the empty block. */ - else if (error == ENOSPC && args->total == 0) - xfs_da_buf_done(dbp); - else + else if (!(error == ENOSPC && args->total == 0)) return error; } /* * If we got rid of the data block, we can eliminate that entry * in the free block. */ - if (data == NULL) { - /* - * One less used entry in the free table. - */ - be32_add(&free->hdr.nused, -1); - xfs_dir2_free_log_header(tp, fbp); - /* - * If this was the last entry in the table, we can - * trim the table size back. There might be other - * entries at the end referring to non-existent - * data blocks, get those too. - */ - if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { - int i; /* free entry index */ - - for (i = findex - 1; - i >= 0 && be16_to_cpu(free->bests[i]) == NULLDATAOFF; - i--) - continue; - free->hdr.nvalid = cpu_to_be32(i + 1); - logfree = 0; - } - /* - * Not the last entry, just punch it out. - */ - else { - free->bests[findex] = cpu_to_be16(NULLDATAOFF); - logfree = 1; - } - /* - * If there are no useful entries left in the block, - * get rid of the block if we can. - */ - if (!free->hdr.nused) { - error = xfs_dir2_shrink_inode(args, fdb, fbp); - if (error == 0) { - fbp = NULL; - logfree = 0; - } else if (error != ENOSPC || args->total != 0) - return error; - /* - * It's possible to get ENOSPC if there is no - * space reservation. In this case some one - * else will eventually get rid of this block. - */ - } - } - /* - * Data block is not empty, just set the free entry to - * the new value. - */ - else { - free->bests[findex] = cpu_to_be16(longest); - logfree = 1; - } - /* - * Log the free entry that changed, unless we got rid of it. - */ - if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); - /* - * Drop the buffer if we still have it. - */ - if (fbp) - xfs_da_buf_done(fbp); + error = xfs_dir3_data_block_free(args, hdr, free, + fdb, findex, fbp, longest); + if (error) + return error; } - xfs_dir2_leafn_check(dp, bp); + + xfs_dir3_leaf_check(dp, bp); /* - * Return indication of whether this leaf block is emtpy enough + * Return indication of whether this leaf block is empty enough * to justify trying to join it with a neighbor. */ - *rval = - ((uint)sizeof(leaf->hdr) + - (uint)sizeof(leaf->ents[0]) * - (be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale))) < - mp->m_dir_magicpct; + *rval = (dp->d_ops->leaf_hdr_size + + (uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) < + args->geo->magicpct; return 0; } @@ -1058,13 +1324,14 @@ xfs_dir2_leafn_split( xfs_dablk_t blkno; /* new leaf block number */ int error; /* error return value */ xfs_mount_t *mp; /* filesystem mount point */ + struct xfs_inode *dp; /* * Allocate space for a new leaf node. */ args = state->args; - mp = args->dp->i_mount; - ASSERT(args != NULL); + dp = args->dp; + mp = dp->i_mount; ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC); error = xfs_da_grow_inode(args, &blkno); if (error) { @@ -1073,11 +1340,11 @@ xfs_dir2_leafn_split( /* * Initialize the new leaf block. */ - error = xfs_dir2_leaf_init(args, XFS_DIR2_DA_TO_DB(mp, blkno), - &newblk->bp, XFS_DIR2_LEAFN_MAGIC); - if (error) { + error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno), + &newblk->bp, XFS_DIR2_LEAFN_MAGIC); + if (error) return error; - } + newblk->blkno = blkno; newblk->magic = XFS_DIR2_LEAFN_MAGIC; /* @@ -1085,7 +1352,7 @@ xfs_dir2_leafn_split( * block into the leaves. */ xfs_dir2_leafn_rebalance(state, oldblk, newblk); - error = xfs_da_blk_link(state, oldblk, newblk); + error = xfs_da3_blk_link(state, oldblk, newblk); if (error) { return error; } @@ -1099,10 +1366,10 @@ xfs_dir2_leafn_split( /* * Update last hashval in each block since we added the name. */ - oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL); - newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL); - xfs_dir2_leafn_check(args->dp, oldblk->bp); - xfs_dir2_leafn_check(args->dp, newblk->bp); + oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL); + newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL); + xfs_dir3_leaf_check(dp, oldblk->bp); + xfs_dir3_leaf_check(dp, newblk->bp); return error; } @@ -1122,15 +1389,17 @@ xfs_dir2_leafn_toosmall( { xfs_da_state_blk_t *blk; /* leaf block */ xfs_dablk_t blkno; /* leaf block number */ - xfs_dabuf_t *bp; /* leaf buffer */ + struct xfs_buf *bp; /* leaf buffer */ int bytes; /* bytes in use */ int count; /* leaf live entry count */ int error; /* error return value */ int forward; /* sibling block direction */ int i; /* sibling counter */ - xfs_da_blkinfo_t *info; /* leaf block header */ xfs_dir2_leaf_t *leaf; /* leaf structure */ int rval; /* result from path_shift */ + struct xfs_dir3_icleaf_hdr leafhdr; + struct xfs_dir2_leaf_entry *ents; + struct xfs_inode *dp = state->args->dp; /* * Check for the degenerate case of the block being over 50% full. @@ -1138,12 +1407,14 @@ xfs_dir2_leafn_toosmall( * to coalesce with a sibling. */ blk = &state->path.blk[state->path.active - 1]; - info = blk->bp->data; - ASSERT(be16_to_cpu(info->magic) == XFS_DIR2_LEAFN_MAGIC); - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = (uint)sizeof(leaf->hdr) + count * (uint)sizeof(leaf->ents[0]); - if (bytes > (state->blocksize >> 1)) { + leaf = blk->bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + xfs_dir3_leaf_check(dp, blk->bp); + + count = leafhdr.count - leafhdr.stale; + bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]); + if (bytes > (state->args->geo->blksize >> 1)) { /* * Blk over 50%, don't try to join. */ @@ -1161,9 +1432,9 @@ xfs_dir2_leafn_toosmall( * Make altpath point to the block we want to keep and * path point to the block we want to drop (this one). */ - forward = (info->forw != 0); + forward = (leafhdr.forw != 0); memcpy(&state->altpath, &state->path, sizeof(state->path)); - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); if (error) return error; @@ -1177,36 +1448,40 @@ xfs_dir2_leafn_toosmall( * We prefer coalescing with the lower numbered sibling so as * to shrink a directory over time. */ - forward = be32_to_cpu(info->forw) < be32_to_cpu(info->back); + forward = leafhdr.forw < leafhdr.back; for (i = 0, bp = NULL; i < 2; forward = !forward, i++) { - blkno = forward ? be32_to_cpu(info->forw) : be32_to_cpu(info->back); + struct xfs_dir3_icleaf_hdr hdr2; + + blkno = forward ? leafhdr.forw : leafhdr.back; if (blkno == 0) continue; /* * Read the sibling leaf block. */ - if ((error = - xfs_da_read_buf(state->args->trans, state->args->dp, blkno, - -1, &bp, XFS_DATA_FORK))) { + error = xfs_dir3_leafn_read(state->args->trans, dp, + blkno, -1, &bp); + if (error) return error; - } - ASSERT(bp != NULL); + /* * Count bytes in the two blocks combined. */ - leaf = (xfs_dir2_leaf_t *)info; - count = be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes = state->blocksize - (state->blocksize >> 2); - leaf = bp->data; - ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - count += be16_to_cpu(leaf->hdr.count) - be16_to_cpu(leaf->hdr.stale); - bytes -= count * (uint)sizeof(leaf->ents[0]); + count = leafhdr.count - leafhdr.stale; + bytes = state->args->geo->blksize - + (state->args->geo->blksize >> 2); + + leaf = bp->b_addr; + dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf); + ents = dp->d_ops->leaf_ents_p(leaf); + count += hdr2.count - hdr2.stale; + bytes -= count * sizeof(ents[0]); + /* * Fits with at least 25% to spare. */ if (bytes >= 0) break; - xfs_da_brelse(state->args->trans, bp); + xfs_trans_brelse(state->args->trans, bp); } /* * Didn't like either block, give up. @@ -1215,21 +1490,17 @@ xfs_dir2_leafn_toosmall( *action = 0; return 0; } - /* - * Done with the sibling leaf block here, drop the dabuf - * so path_shift can get it. - */ - xfs_da_buf_done(bp); + /* * Make altpath point to the block we want to keep (the lower * numbered block) and path point to the block we want to drop. */ memcpy(&state->altpath, &state->path, sizeof(state->path)); if (blkno < blk->blkno) - error = xfs_da_path_shift(state, &state->altpath, forward, 0, + error = xfs_da3_path_shift(state, &state->altpath, forward, 0, &rval); else - error = xfs_da_path_shift(state, &state->path, forward, 0, + error = xfs_da3_path_shift(state, &state->path, forward, 0, &rval); if (error) { return error; @@ -1251,34 +1522,54 @@ xfs_dir2_leafn_unbalance( xfs_da_args_t *args; /* operation arguments */ xfs_dir2_leaf_t *drop_leaf; /* dead leaf structure */ xfs_dir2_leaf_t *save_leaf; /* surviving leaf structure */ + struct xfs_dir3_icleaf_hdr savehdr; + struct xfs_dir3_icleaf_hdr drophdr; + struct xfs_dir2_leaf_entry *sents; + struct xfs_dir2_leaf_entry *dents; + struct xfs_inode *dp = state->args->dp; args = state->args; ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(save_blk->magic == XFS_DIR2_LEAFN_MAGIC); - drop_leaf = drop_blk->bp->data; - save_leaf = save_blk->bp->data; - ASSERT(be16_to_cpu(drop_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); - ASSERT(be16_to_cpu(save_leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC); + drop_leaf = drop_blk->bp->b_addr; + save_leaf = save_blk->bp->b_addr; + + dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf); + dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf); + sents = dp->d_ops->leaf_ents_p(save_leaf); + dents = dp->d_ops->leaf_ents_p(drop_leaf); + /* * If there are any stale leaf entries, take this opportunity * to purge them. */ - if (drop_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, drop_blk->bp); - if (save_leaf->hdr.stale) - xfs_dir2_leaf_compact(args, save_blk->bp); + if (drophdr.stale) + xfs_dir3_leaf_compact(args, &drophdr, drop_blk->bp); + if (savehdr.stale) + xfs_dir3_leaf_compact(args, &savehdr, save_blk->bp); + /* * Move the entries from drop to the appropriate end of save. */ - drop_blk->hashval = be32_to_cpu(drop_leaf->ents[be16_to_cpu(drop_leaf->hdr.count) - 1].hashval); - if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp)) - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, 0, - be16_to_cpu(drop_leaf->hdr.count)); + drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval); + if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp)) + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, 0, + drophdr.count); else - xfs_dir2_leafn_moveents(args, drop_blk->bp, 0, save_blk->bp, - be16_to_cpu(save_leaf->hdr.count), be16_to_cpu(drop_leaf->hdr.count)); - save_blk->hashval = be32_to_cpu(save_leaf->ents[be16_to_cpu(save_leaf->hdr.count) - 1].hashval); - xfs_dir2_leafn_check(args->dp, save_blk->bp); + xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0, + save_blk->bp, &savehdr, sents, + savehdr.count, drophdr.count); + save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval); + + /* log the changes made when moving the entries */ + dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr); + dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr); + xfs_dir3_leaf_log_header(args, save_blk->bp); + xfs_dir3_leaf_log_header(args, drop_blk->bp); + + xfs_dir3_leaf_check(dp, save_blk->bp); + xfs_dir3_leaf_check(dp, drop_blk->bp); } /* @@ -1293,20 +1584,19 @@ xfs_dir2_node_addname( int rval; /* sub-return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_addname", args); + trace_xfs_dir2_node_addname(args); + /* * Allocate and initialize the state (btree cursor). */ state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Look up the name. We're not supposed to find it, but * this gives us the insertion point. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; if (rval != ENOENT) { @@ -1331,8 +1621,8 @@ xfs_dir2_node_addname( /* * It worked, fix the hash values up the btree. */ - if (!args->justcheck) - xfs_da_fixhashpath(state, &state->path); + if (!(args->op_flags & XFS_DA_OP_JUSTCHECK)) + xfs_da3_fixhashpath(state, &state->path); } else { /* * It didn't work, we need to split the leaf block. @@ -1344,7 +1634,7 @@ xfs_dir2_node_addname( /* * Split the leaf block and insert the new entry. */ - rval = xfs_da_split(state); + rval = xfs_da3_split(state); } done: xfs_da_state_free(state); @@ -1361,15 +1651,15 @@ xfs_dir2_node_addname_int( xfs_da_args_t *args, /* operation arguments */ xfs_da_state_blk_t *fblk) /* optional freespace block */ { - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_db_t dbno; /* data block number */ - xfs_dabuf_t *dbp; /* data block buffer */ + struct xfs_buf *dbp; /* data block buffer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ xfs_dir2_data_unused_t *dup; /* data unused entry pointer */ int error; /* error return value */ xfs_dir2_db_t fbno; /* freespace block number */ - xfs_dabuf_t *fbp; /* freespace buffer */ + struct xfs_buf *fbp; /* freespace buffer */ int findex; /* freespace entry index */ xfs_dir2_free_t *free=NULL; /* freespace block structure */ xfs_dir2_db_t ifbno; /* initial freespace block no */ @@ -1381,11 +1671,14 @@ xfs_dir2_node_addname_int( int needscan; /* need to rescan data frees */ __be16 *tagp; /* data entry tag pointer */ xfs_trans_t *tp; /* transaction pointer */ + __be16 *bests; + struct xfs_dir3_icfree_hdr freehdr; + struct xfs_dir2_data_free *bf; dp = args->dp; mp = dp->i_mount; tp = args->trans; - length = XFS_DIR2_DATA_ENTSIZE(args->namelen); + length = dp->d_ops->data_entsize(args->namelen); /* * If we came in with a freespace block that means that lookup * found an entry with our hash value. This is the freespace @@ -1397,37 +1690,38 @@ xfs_dir2_node_addname_int( * Remember initial freespace block number. */ ifbno = fblk->blkno; - free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + free = fbp->b_addr; findex = fblk->index; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + /* * This means the free entry showed that the data block had * space for our entry, so we remembered it. * Use that data block. */ if (findex >= 0) { - ASSERT(findex < be32_to_cpu(free->hdr.nvalid)); - ASSERT(be16_to_cpu(free->bests[findex]) != NULLDATAOFF); - ASSERT(be16_to_cpu(free->bests[findex]) >= length); - dbno = be32_to_cpu(free->hdr.firstdb) + findex; - } - /* - * The data block looked at didn't have enough room. - * We'll start at the beginning of the freespace entries. - */ - else { + ASSERT(findex < freehdr.nvalid); + ASSERT(be16_to_cpu(bests[findex]) != NULLDATAOFF); + ASSERT(be16_to_cpu(bests[findex]) >= length); + dbno = freehdr.firstdb + findex; + } else { + /* + * The data block looked at didn't have enough room. + * We'll start at the beginning of the freespace entries. + */ dbno = -1; findex = 0; } - } - /* - * Didn't come in with a freespace block, so don't have a data block. - */ - else { + } else { + /* + * Didn't come in with a freespace block, so no data block. + */ ifbno = dbno = -1; fbp = NULL; findex = 0; } + /* * If we don't have a data block yet, we're going to scan the * freespace blocks looking for one. Figure out what the @@ -1436,9 +1730,9 @@ xfs_dir2_node_addname_int( if (dbno == -1) { xfs_fileoff_t fo; /* freespace block number */ - if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) + if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) return error; - lastfbno = XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo); + lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo); fbno = ifbno; } /* @@ -1456,7 +1750,8 @@ xfs_dir2_node_addname_int( * us a freespace block to start with. */ if (++fbno == 0) - fbno = XFS_DIR2_FREE_FIRSTDB(mp); + fbno = xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET); /* * If it's ifbno we already looked at it. */ @@ -1473,33 +1768,38 @@ xfs_dir2_node_addname_int( * This should be really rare, so there's no reason * to avoid it. */ - if ((error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); + if (error) return error; - } - if (unlikely(fbp == NULL)) { + if (!fbp) continue; - } - free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + free = fbp->b_addr; findex = 0; } /* * Look at the current free entry. Is it good enough? + * + * The bests initialisation should be where the bufer is read in + * the above branch. But gcc is too stupid to realise that bests + * and the freehdr are actually initialised if they are placed + * there, so we have to do it here to avoid warnings. Blech. */ - if (be16_to_cpu(free->bests[findex]) != NULLDATAOFF && - be16_to_cpu(free->bests[findex]) >= length) - dbno = be32_to_cpu(free->hdr.firstdb) + findex; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); + if (be16_to_cpu(bests[findex]) != NULLDATAOFF && + be16_to_cpu(bests[findex]) >= length) + dbno = freehdr.firstdb + findex; else { /* * Are we done with the freeblock? */ - if (++findex == be32_to_cpu(free->hdr.nvalid)) { + if (++findex == freehdr.nvalid) { /* * Drop the block. */ - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); fbp = NULL; if (fblk && fblk->bp) fblk->bp = NULL; @@ -1514,35 +1814,23 @@ xfs_dir2_node_addname_int( /* * Not allowed to allocate, return failure. */ - if (args->justcheck || args->total == 0) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); - } + /* * Allocate and initialize the new data block. */ if (unlikely((error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &dbno)) || - (error = xfs_dir2_data_init(args, dbno, &dbp)))) { - /* - * Drop the freespace buffer unless it came from our - * caller. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + (error = xfs_dir3_data_init(args, dbno, &dbp)))) return error; - } + /* * If (somehow) we have a freespace block, get rid of it. */ if (fbp) - xfs_da_brelse(tp, fbp); + xfs_trans_brelse(tp, fbp); if (fblk && fblk->bp) fblk->bp = NULL; @@ -1550,44 +1838,41 @@ xfs_dir2_node_addname_int( * Get the freespace block corresponding to the data block * that was just allocated. */ - fbno = XFS_DIR2_DB_TO_FDB(mp, dbno); - if (unlikely(error = xfs_da_read_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), -2, &fbp, - XFS_DATA_FORK))) { - xfs_da_buf_done(dbp); + fbno = dp->d_ops->db_to_fdb(args->geo, dbno); + error = xfs_dir2_free_try_read(tp, dp, + xfs_dir2_db_to_da(args->geo, fbno), + &fbp); + if (error) return error; - } + /* * If there wasn't a freespace block, the read will * return a NULL fbp. Allocate and initialize a new one. */ - if( fbp == NULL ) { - if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, - &fbno))) { + if (!fbp) { + error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, + &fbno); + if (error) return error; - } - if (unlikely(XFS_DIR2_DB_TO_FDB(mp, dbno) != fbno)) { - cmn_err(CE_ALERT, - "xfs_dir2_node_addname_int: dir ino " - "%llu needed freesp block %lld for\n" - " data block %lld, got %lld\n" - " ifbno %llu lastfbno %d\n", - (unsigned long long)dp->i_ino, - (long long)XFS_DIR2_DB_TO_FDB(mp, dbno), + if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) { + xfs_alert(mp, + "%s: dir ino %llu needed freesp block %lld for\n" + " data block %lld, got %lld ifbno %llu lastfbno %d", + __func__, (unsigned long long)dp->i_ino, + (long long)dp->d_ops->db_to_fdb( + args->geo, dbno), (long long)dbno, (long long)fbno, (unsigned long long)ifbno, lastfbno); if (fblk) { - cmn_err(CE_ALERT, - " fblk 0x%p blkno %llu " - "index %d magic 0x%x\n", + xfs_alert(mp, + " fblk 0x%p blkno %llu index %d magic 0x%x", fblk, (unsigned long long)fblk->blkno, fblk->index, fblk->magic); } else { - cmn_err(CE_ALERT, - " ... fblk is NULL\n"); + xfs_alert(mp, " ... fblk is NULL"); } XFS_ERROR_REPORT("xfs_dir2_node_addname_int", XFS_ERRLEVEL_LOW, mp); @@ -1597,60 +1882,59 @@ xfs_dir2_node_addname_int( /* * Get a buffer for the new block. */ - if ((error = xfs_da_get_buf(tp, dp, - XFS_DIR2_DB_TO_DA(mp, fbno), - -1, &fbp, XFS_DATA_FORK))) { + error = xfs_dir3_free_get_buf(args, fbno, &fbp); + if (error) return error; - } - ASSERT(fbp != NULL); + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); /* - * Initialize the new block to be empty, and remember - * its first slot as our empty slot. + * Remember the first slot as our empty slot. */ - free = fbp->data; - free->hdr.magic = cpu_to_be32(XFS_DIR2_FREE_MAGIC); - free->hdr.firstdb = cpu_to_be32( - (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * - XFS_DIR2_MAX_FREE_BESTS(mp)); - free->hdr.nvalid = 0; - free->hdr.nused = 0; + freehdr.firstdb = + (fbno - xfs_dir2_byte_to_db(args->geo, + XFS_DIR2_FREE_OFFSET)) * + dp->d_ops->free_max_bests(args->geo); } else { - free = fbp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + free = fbp->b_addr; + bests = dp->d_ops->free_bests_p(free); + dp->d_ops->free_hdr_from_disk(&freehdr, free); } /* * Set the freespace block index from the data block number. */ - findex = XFS_DIR2_DB_TO_FDINDEX(mp, dbno); + findex = dp->d_ops->db_to_fdindex(args->geo, dbno); /* * If it's after the end of the current entries in the * freespace block, extend that table. */ - if (findex >= be32_to_cpu(free->hdr.nvalid)) { - ASSERT(findex < XFS_DIR2_MAX_FREE_BESTS(mp)); - free->hdr.nvalid = cpu_to_be32(findex + 1); + if (findex >= freehdr.nvalid) { + ASSERT(findex < dp->d_ops->free_max_bests(args->geo)); + freehdr.nvalid = findex + 1; /* * Tag new entry so nused will go up. */ - free->bests[findex] = cpu_to_be16(NULLDATAOFF); + bests[findex] = cpu_to_be16(NULLDATAOFF); } /* * If this entry was for an empty data block * (this should always be true) then update the header. */ - if (be16_to_cpu(free->bests[findex]) == NULLDATAOFF) { - be32_add(&free->hdr.nused, 1); - xfs_dir2_free_log_header(tp, fbp); + if (bests[findex] == cpu_to_be16(NULLDATAOFF)) { + freehdr.nused++; + dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr); + xfs_dir2_free_log_header(args, fbp); } /* * Update the real value in the table. * We haven't allocated the data entry yet so this will * change again. */ - data = dbp->data; - free->bests[findex] = data->hdr.bestfree[0].length; + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); + bests[findex] = bf[0].length; logfree = 1; } /* @@ -1660,36 +1944,33 @@ xfs_dir2_node_addname_int( /* * If just checking, we succeeded. */ - if (args->justcheck) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; - } + /* * Read the data block in. */ - if (unlikely( - error = xfs_da_read_buf(tp, dp, XFS_DIR2_DB_TO_DA(mp, dbno), - -1, &dbp, XFS_DATA_FORK))) { - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + error = xfs_dir3_data_read(tp, dp, + xfs_dir2_db_to_da(args->geo, dbno), + -1, &dbp); + if (error) return error; - } - data = dbp->data; + hdr = dbp->b_addr; + bf = dp->d_ops->data_bestfree_p(hdr); logfree = 0; } - ASSERT(be16_to_cpu(data->hdr.bestfree[0].length) >= length); + ASSERT(be16_to_cpu(bf[0].length) >= length); /* * Point to the existing unused space. */ dup = (xfs_dir2_data_unused_t *) - ((char *)data + be16_to_cpu(data->hdr.bestfree[0].offset)); + ((char *)hdr + be16_to_cpu(bf[0].offset)); needscan = needlog = 0; /* * Mark the first part of the unused space, inuse for us. */ - xfs_dir2_data_use_free(tp, dbp, dup, - (xfs_dir2_data_aoff_t)((char *)dup - (char *)data), length, + xfs_dir2_data_use_free(args, dbp, dup, + (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length, &needlog, &needscan); /* * Fill in the new entry and log it. @@ -1698,48 +1979,44 @@ xfs_dir2_node_addname_int( dep->inumber = cpu_to_be64(args->inumber); dep->namelen = args->namelen; memcpy(dep->name, args->name, dep->namelen); - tagp = XFS_DIR2_DATA_ENTRY_TAG_P(dep); - *tagp = cpu_to_be16((char *)dep - (char *)data); - xfs_dir2_data_log_entry(tp, dbp, dep); + dp->d_ops->data_put_ftype(dep, args->filetype); + tagp = dp->d_ops->data_entry_tag_p(dep); + *tagp = cpu_to_be16((char *)dep - (char *)hdr); + xfs_dir2_data_log_entry(args, dbp, dep); /* * Rescan the block for bestfree if needed. */ if (needscan) - xfs_dir2_data_freescan(mp, data, &needlog, NULL); + xfs_dir2_data_freescan(dp, hdr, &needlog); /* * Log the data block header if needed. */ if (needlog) - xfs_dir2_data_log_header(tp, dbp); + xfs_dir2_data_log_header(args, dbp); /* * If the freespace entry is now wrong, update it. */ - if (be16_to_cpu(free->bests[findex]) != be16_to_cpu(data->hdr.bestfree[0].length)) { - free->bests[findex] = data->hdr.bestfree[0].length; + bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */ + if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) { + bests[findex] = bf[0].length; logfree = 1; } /* * Log the freespace entry if needed. */ if (logfree) - xfs_dir2_free_log_bests(tp, fbp, findex, findex); - /* - * If the caller didn't hand us the freespace block, drop it. - */ - if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL) - xfs_da_buf_done(fbp); + xfs_dir2_free_log_bests(args, fbp, findex, findex); /* * Return the data block and offset in args, then drop the data block. */ args->blkno = (xfs_dablk_t)dbno; args->index = be16_to_cpu(*tagp); - xfs_da_buf_done(dbp); return 0; } /* * Lookup an entry in a node-format directory. - * All the real work happens in xfs_da_node_lookup_int. + * All the real work happens in xfs_da3_node_lookup_int. * The only real output is the inode number of the entry. */ int /* error */ @@ -1751,33 +2028,41 @@ xfs_dir2_node_lookup( int rval; /* operation return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_lookup", args); + trace_xfs_dir2_node_lookup(args); + /* * Allocate and initialize the btree cursor. */ state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; /* * Fill in the path to the entry in the cursor. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) rval = error; + else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { + /* If a CI match, dup the actual name and return EEXIST */ + xfs_dir2_data_entry_t *dep; + + dep = (xfs_dir2_data_entry_t *) + ((char *)state->extrablk.bp->b_addr + + state->extrablk.index); + rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen); + } /* * Release the btree blocks and leaf block. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } /* * Release the data block if we have it. */ if (state->extravalid && state->extrablk.bp) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } xfs_da_state_free(state); @@ -1789,36 +2074,33 @@ xfs_dir2_node_lookup( */ int /* error */ xfs_dir2_node_removename( - xfs_da_args_t *args) /* operation arguments */ + struct xfs_da_args *args) /* operation arguments */ { - xfs_da_state_blk_t *blk; /* leaf block */ + struct xfs_da_state_blk *blk; /* leaf block */ int error; /* error return value */ int rval; /* operation return value */ - xfs_da_state_t *state; /* btree cursor */ + struct xfs_da_state *state; /* btree cursor */ + + trace_xfs_dir2_node_removename(args); - xfs_dir2_trace_args("node_removename", args); /* * Allocate and initialize the btree cursor. */ state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; - /* - * Look up the entry we're deleting, set up the cursor. - */ - error = xfs_da_node_lookup_int(state, &rval); - if (error) { - rval = error; - } - /* - * Didn't find it, upper layer screwed up. - */ + + /* Look up the entry we're deleting, set up the cursor. */ + error = xfs_da3_node_lookup_int(state, &rval); + if (error) + goto out_free; + + /* Didn't find it, upper layer screwed up. */ if (rval != EEXIST) { - xfs_da_state_free(state); - return rval; + error = rval; + goto out_free; } + blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); ASSERT(state->extravalid); @@ -1828,23 +2110,23 @@ xfs_dir2_node_removename( */ error = xfs_dir2_leafn_remove(args, blk->bp, blk->index, &state->extrablk, &rval); - if (error) { - return error; - } + if (error) + goto out_free; /* * Fix the hash values up the btree. */ - xfs_da_fixhashpath(state, &state->path); + xfs_da3_fixhashpath(state, &state->path); /* * If we need to join leaf blocks, do it. */ if (rval && state->path.active > 1) - error = xfs_da_join(state); + error = xfs_da3_join(state); /* * If no errors so far, try conversion to leaf format. */ if (!error) error = xfs_dir2_node_to_leaf(state); +out_free: xfs_da_state_free(state); return error; } @@ -1857,7 +2139,7 @@ xfs_dir2_node_replace( xfs_da_args_t *args) /* operation arguments */ { xfs_da_state_blk_t *blk; /* leaf block */ - xfs_dir2_data_t *data; /* data block structure */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ xfs_dir2_data_entry_t *dep; /* data entry changed */ int error; /* error return value */ int i; /* btree level */ @@ -1867,20 +2149,19 @@ xfs_dir2_node_replace( int rval; /* internal return value */ xfs_da_state_t *state; /* btree cursor */ - xfs_dir2_trace_args("node_replace", args); + trace_xfs_dir2_node_replace(args); + /* * Allocate and initialize the btree cursor. */ state = xfs_da_state_alloc(); state->args = args; state->mp = args->dp->i_mount; - state->blocksize = state->mp->m_dirblksize; - state->node_ents = state->mp->m_dir_node_ents; inum = args->inumber; /* * Lookup the entry to change in the btree. */ - error = xfs_da_node_lookup_int(state, &rval); + error = xfs_da3_node_lookup_int(state, &rval); if (error) { rval = error; } @@ -1889,42 +2170,47 @@ xfs_dir2_node_replace( * and locked it. But paranoia is good. */ if (rval == EEXIST) { + struct xfs_dir2_leaf_entry *ents; /* * Find the leaf entry. */ blk = &state->path.blk[state->path.active - 1]; ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC); - leaf = blk->bp->data; - lep = &leaf->ents[blk->index]; + leaf = blk->bp->b_addr; + ents = args->dp->d_ops->leaf_ents_p(leaf); + lep = &ents[blk->index]; ASSERT(state->extravalid); /* * Point to the data entry. */ - data = state->extrablk.bp->data; - ASSERT(be32_to_cpu(data->hdr.magic) == XFS_DIR2_DATA_MAGIC); + hdr = state->extrablk.bp->b_addr; + ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || + hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC)); dep = (xfs_dir2_data_entry_t *) - ((char *)data + - XFS_DIR2_DATAPTR_TO_OFF(state->mp, be32_to_cpu(lep->address))); + ((char *)hdr + + xfs_dir2_dataptr_to_off(args->geo, + be32_to_cpu(lep->address))); ASSERT(inum != be64_to_cpu(dep->inumber)); /* * Fill in the new inode number and log the entry. */ dep->inumber = cpu_to_be64(inum); - xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep); + args->dp->d_ops->data_put_ftype(dep, args->filetype); + xfs_dir2_data_log_entry(args, state->extrablk.bp, dep); rval = 0; } /* * Didn't find it, and we're holding a data block. Drop it. */ else if (state->extravalid) { - xfs_da_brelse(args->trans, state->extrablk.bp); + xfs_trans_brelse(args->trans, state->extrablk.bp); state->extrablk.bp = NULL; } /* * Release all the buffers in the cursor. */ for (i = 0; i < state->path.active; i++) { - xfs_da_brelse(args->trans, state->path.blk[i].bp); + xfs_trans_brelse(args->trans, state->path.blk[i].bp); state->path.blk[i].bp = NULL; } xfs_da_state_free(state); @@ -1941,12 +2227,13 @@ xfs_dir2_node_trim_free( xfs_fileoff_t fo, /* free block number */ int *rvalp) /* out: did something */ { - xfs_dabuf_t *bp; /* freespace buffer */ + struct xfs_buf *bp; /* freespace buffer */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return code */ xfs_dir2_free_t *free; /* freespace structure */ xfs_mount_t *mp; /* filesystem mount point */ xfs_trans_t *tp; /* transaction pointer */ + struct xfs_dir3_icfree_hdr freehdr; dp = args->dp; mp = dp->i_mount; @@ -1954,41 +2241,39 @@ xfs_dir2_node_trim_free( /* * Read the freespace block. */ - if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, - XFS_DATA_FORK))) { + error = xfs_dir2_free_try_read(tp, dp, fo, &bp); + if (error) return error; - } - /* * There can be holes in freespace. If fo is a hole, there's * nothing to do. */ - if (bp == NULL) { + if (!bp) return 0; - } - free = bp->data; - ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC); + free = bp->b_addr; + dp->d_ops->free_hdr_from_disk(&freehdr, free); + /* * If there are used entries, there's nothing to do. */ - if (be32_to_cpu(free->hdr.nused) > 0) { - xfs_da_brelse(tp, bp); + if (freehdr.nused > 0) { + xfs_trans_brelse(tp, bp); *rvalp = 0; return 0; } /* * Blow the block away. */ - if ((error = - xfs_dir2_shrink_inode(args, XFS_DIR2_DA_TO_DB(mp, (xfs_dablk_t)fo), - bp))) { + error = xfs_dir2_shrink_inode(args, + xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp); + if (error) { /* * Can't fail with ENOSPC since that only happens with no * space reservation, when breaking up an extent into two * pieces. This is the last block of an extent. */ ASSERT(error != ENOSPC); - xfs_da_brelse(tp, bp); + xfs_trans_brelse(tp, bp); return error; } /* diff --git a/fs/xfs/xfs_dir2_node.h b/fs/xfs/xfs_dir2_node.h deleted file mode 100644 index c7c870ee785..00000000000 --- a/fs/xfs/xfs_dir2_node.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_NODE_H__ -#define __XFS_DIR2_NODE_H__ - -/* - * Directory version 2, btree node format structures - */ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_da_state; -struct xfs_da_state_blk; -struct xfs_inode; -struct xfs_trans; - -/* - * Offset of the freespace index. - */ -#define XFS_DIR2_FREE_SPACE 2 -#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE) -#define XFS_DIR2_FREE_FIRSTDB(mp) \ - XFS_DIR2_BYTE_TO_DB(mp, XFS_DIR2_FREE_OFFSET) - -#define XFS_DIR2_FREE_MAGIC 0x58443246 /* XD2F */ - -typedef struct xfs_dir2_free_hdr { - __be32 magic; /* XFS_DIR2_FREE_MAGIC */ - __be32 firstdb; /* db of first entry */ - __be32 nvalid; /* count of valid entries */ - __be32 nused; /* count of used entries */ -} xfs_dir2_free_hdr_t; - -typedef struct xfs_dir2_free { - xfs_dir2_free_hdr_t hdr; /* block header */ - __be16 bests[1]; /* best free counts */ - /* unused entries are -1 */ -} xfs_dir2_free_t; - -#define XFS_DIR2_MAX_FREE_BESTS(mp) \ - (((mp)->m_dirblksize - (uint)sizeof(xfs_dir2_free_hdr_t)) / \ - (uint)sizeof(xfs_dir2_data_off_t)) - -/* - * Convert data space db to the corresponding free db. - */ -#define XFS_DIR2_DB_TO_FDB(mp,db) xfs_dir2_db_to_fdb(mp, db) -static inline xfs_dir2_db_t -xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return (XFS_DIR2_FREE_FIRSTDB(mp) + (db) / XFS_DIR2_MAX_FREE_BESTS(mp)); -} - -/* - * Convert data space db to the corresponding index in a free db. - */ -#define XFS_DIR2_DB_TO_FDINDEX(mp,db) xfs_dir2_db_to_fdindex(mp, db) -static inline int -xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db) -{ - return ((db) % XFS_DIR2_MAX_FREE_BESTS(mp)); -} - -extern void xfs_dir2_free_log_bests(struct xfs_trans *tp, struct xfs_dabuf *bp, - int first, int last); -extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, - struct xfs_dabuf *lbp); -extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_dabuf *bp, int *count); -extern int xfs_dir2_leafn_lookup_int(struct xfs_dabuf *bp, - struct xfs_da_args *args, int *indexp, - struct xfs_da_state *state); -extern int xfs_dir2_leafn_order(struct xfs_dabuf *leaf1_bp, - struct xfs_dabuf *leaf2_bp); -extern int xfs_dir2_leafn_split(struct xfs_da_state *state, - struct xfs_da_state_blk *oldblk, - struct xfs_da_state_blk *newblk); -extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); -extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, - struct xfs_da_state_blk *drop_blk, - struct xfs_da_state_blk *save_blk); -extern int xfs_dir2_node_addname(struct xfs_da_args *args); -extern int xfs_dir2_node_lookup(struct xfs_da_args *args); -extern int xfs_dir2_node_removename(struct xfs_da_args *args); -extern int xfs_dir2_node_replace(struct xfs_da_args *args); -extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, - int *rvalp); - -#endif /* __XFS_DIR2_NODE_H__ */ diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h new file mode 100644 index 00000000000..27ce0794d19 --- /dev/null +++ b/fs/xfs/xfs_dir2_priv.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DIR2_PRIV_H__ +#define __XFS_DIR2_PRIV_H__ + +struct dir_context; + +/* + * Directory offset/block conversion functions. + * + * DB blocks here are logical directory block numbers, not filesystem blocks. + */ + +/* + * Convert dataptr to byte in file space + */ +static inline xfs_dir2_off_t +xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp) +{ + return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG; +} + +/* + * Convert byte in file space to dataptr. It had better be aligned. + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by) +{ + return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG); +} + +/* + * Convert byte in space to (DB) block + */ +static inline xfs_dir2_db_t +xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_db_t)(by >> geo->blklog); +} + +/* + * Convert dataptr to a block number + */ +static inline xfs_dir2_db_t +xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert byte in space to offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1)); +} + +/* + * Convert dataptr to a byte offset in a block + */ +static inline xfs_dir2_data_aoff_t +xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp) +{ + return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp)); +} + +/* + * Convert block and offset to byte in space + */ +static inline xfs_dir2_off_t +xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return ((xfs_dir2_off_t)db << geo->blklog) + o; +} + +/* + * Convert block (DB) to block (dablk) + */ +static inline xfs_dablk_t +xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db) +{ + return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog)); +} + +/* + * Convert byte in space to (DA) block + */ +static inline xfs_dablk_t +xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by) +{ + return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by)); +} + +/* + * Convert block and offset to dataptr + */ +static inline xfs_dir2_dataptr_t +xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db, + xfs_dir2_data_aoff_t o) +{ + return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o)); +} + +/* + * Convert block (dablk) to block (DB) + */ +static inline xfs_dir2_db_t +xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog)); +} + +/* + * Convert block (dablk) to byte offset in space + */ +static inline xfs_dir2_off_t +xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da) +{ + return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0); +} + +/* + * Directory tail pointer accessor functions. Based on block geometry. + */ +static inline struct xfs_dir2_block_tail * +xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr) +{ + return ((struct xfs_dir2_block_tail *) + ((char *)hdr + geo->blksize)) - 1; +} + +static inline struct xfs_dir2_leaf_tail * +xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp) +{ + return (struct xfs_dir2_leaf_tail *) + ((char *)lp + geo->blksize - + sizeof(struct xfs_dir2_leaf_tail)); +} + +/* xfs_dir2.c */ +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); +extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, + xfs_dir2_db_t *dbp); +extern int xfs_dir_cilookup_result(struct xfs_da_args *args, + const unsigned char *name, int len); + +#define S_SHIFT 12 +extern const unsigned char xfs_mode_to_ftype[]; + +extern unsigned char xfs_dir3_get_dtype(struct xfs_mount *mp, + __uint8_t filetype); + + +/* xfs_dir2_block.c */ +extern int xfs_dir3_block_read(struct xfs_trans *tp, struct xfs_inode *dp, + struct xfs_buf **bpp); +extern int xfs_dir2_block_addname(struct xfs_da_args *args); +extern int xfs_dir2_block_lookup(struct xfs_da_args *args); +extern int xfs_dir2_block_removename(struct xfs_da_args *args); +extern int xfs_dir2_block_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, + struct xfs_buf *lbp, struct xfs_buf *dbp); + +/* xfs_dir2_data.c */ +#ifdef DEBUG +#define xfs_dir3_data_check(dp,bp) __xfs_dir3_data_check(dp, bp); +#else +#define xfs_dir3_data_check(dp,bp) +#endif + +extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp); +extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); +extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno, + xfs_daddr_t mapped_bno); + +extern struct xfs_dir2_data_free * +xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, + struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup, + int *loghead); +extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno, + struct xfs_buf **bpp); + +/* xfs_dir2_leaf.c */ +extern int xfs_dir3_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); +extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, + struct xfs_buf *dbp); +extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); +extern void xfs_dir3_leaf_compact(struct xfs_da_args *args, + struct xfs_dir3_icleaf_hdr *leafhdr, struct xfs_buf *bp); +extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int *indexp, + int *lowstalep, int *highstalep, int *lowlogp, int *highlogp); +extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno, + struct xfs_buf **bpp, __uint16_t magic); +extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args, + struct xfs_buf *bp, int first, int last); +extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args, + struct xfs_buf *bp); +extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_leaf_removename(struct xfs_da_args *args); +extern int xfs_dir2_leaf_replace(struct xfs_da_args *args); +extern int xfs_dir2_leaf_search_hash(struct xfs_da_args *args, + struct xfs_buf *lbp); +extern int xfs_dir2_leaf_trim_data(struct xfs_da_args *args, + struct xfs_buf *lbp, xfs_dir2_db_t db); +extern struct xfs_dir2_leaf_entry * +xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr, + struct xfs_dir2_leaf_entry *ents, int index, int compact, + int lowstale, int highstale, int *lfloglow, int *lfloghigh); +extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state); + +extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp, + struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf); + +/* xfs_dir2_node.c */ +extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args, + struct xfs_buf *lbp); +extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp, + struct xfs_buf *bp, int *count); +extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp, + struct xfs_da_args *args, int *indexp, + struct xfs_da_state *state); +extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp, + struct xfs_buf *leaf2_bp); +extern int xfs_dir2_leafn_split(struct xfs_da_state *state, + struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk); +extern int xfs_dir2_leafn_toosmall(struct xfs_da_state *state, int *action); +extern void xfs_dir2_leafn_unbalance(struct xfs_da_state *state, + struct xfs_da_state_blk *drop_blk, + struct xfs_da_state_blk *save_blk); +extern int xfs_dir2_node_addname(struct xfs_da_args *args); +extern int xfs_dir2_node_lookup(struct xfs_da_args *args); +extern int xfs_dir2_node_removename(struct xfs_da_args *args); +extern int xfs_dir2_node_replace(struct xfs_da_args *args); +extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, + int *rvalp); +extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, + xfs_dablk_t fbno, struct xfs_buf **bpp); + +/* xfs_dir2_sf.c */ +extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, + struct xfs_dir2_data_hdr *block, struct xfs_dir2_sf_hdr *sfhp); +extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_buf *bp, + int size, xfs_dir2_sf_hdr_t *sfhp); +extern int xfs_dir2_sf_addname(struct xfs_da_args *args); +extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); +extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); +extern int xfs_dir2_sf_removename(struct xfs_da_args *args); +extern int xfs_dir2_sf_replace(struct xfs_da_args *args); + +/* xfs_dir2_readdir.c */ +extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, + size_t bufsize); + +#endif /* __XFS_DIR2_PRIV_H__ */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c new file mode 100644 index 00000000000..48e99afb9cb --- /dev/null +++ b/fs/xfs/xfs_dir2_readdir.c @@ -0,0 +1,700 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_dinode.h" + +/* + * Directory file type support functions + */ +static unsigned char xfs_dir3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, + DT_FIFO, DT_SOCK, DT_LNK, DT_WHT, +}; + +unsigned char +xfs_dir3_get_dtype( + struct xfs_mount *mp, + __uint8_t filetype) +{ + if (!xfs_sb_version_hasftype(&mp->m_sb)) + return DT_UNKNOWN; + + if (filetype >= XFS_DIR3_FT_MAX) + return DT_UNKNOWN; + + return xfs_dir3_filetype_table[filetype]; +} +/* + * @mode, if set, indicates that the type field needs to be set up. + * This uses the transformation from file mode to DT_* as defined in linux/fs.h + * for file type specification. This will be propagated into the directory + * structure if appropriate for the given operation and filesystem config. + */ +const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { + [0] = XFS_DIR3_FT_UNKNOWN, + [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, + [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, + [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, +}; + +STATIC int +xfs_dir2_sf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + int i; /* shortform entry number */ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_dataptr_t off; /* current entry's offset */ + xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + xfs_dir2_dataptr_t dot_offset; + xfs_dir2_dataptr_t dotdot_offset; + xfs_ino_t ino; + struct xfs_da_geometry *geo = args->geo; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); + /* + * Give up if the directory is way too short. + */ + if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { + ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); + return XFS_ERROR(EIO); + } + + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + /* + * Precalculate offsets for . and .. as we will always need them. + * + * XXX(hch): the second argument is sometimes 0 and sometimes + * geo->datablk + */ + dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dot_offset); + dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + dp->d_ops->data_dotdot_offset); + + /* + * Put . entry unless we're starting past it. + */ + if (ctx->pos <= dot_offset) { + ctx->pos = dot_offset & 0x7fffffff; + if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR)) + return 0; + } + + /* + * Put .. entry unless we're starting past it. + */ + if (ctx->pos <= dotdot_offset) { + ino = dp->d_ops->sf_get_parent_ino(sfp); + ctx->pos = dotdot_offset & 0x7fffffff; + if (!dir_emit(ctx, "..", 2, ino, DT_DIR)) + return 0; + } + + /* + * Loop while there are more entries and put'ing works. + */ + sfep = xfs_dir2_sf_firstentry(sfp); + for (i = 0; i < sfp->count; i++) { + __uint8_t filetype; + + off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + xfs_dir2_sf_get_offset(sfep)); + + if (ctx->pos > off) { + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + continue; + } + + ino = dp->d_ops->sf_get_ino(sfp, sfep); + filetype = dp->d_ops->sf_get_ftype(sfep); + ctx->pos = off & 0x7fffffff; + if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, + xfs_dir3_get_dtype(dp->i_mount, filetype))) + return 0; + sfep = dp->d_ops->sf_nextentry(sfp, sfep); + } + + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + return 0; +} + +/* + * Readdir for block directories. + */ +STATIC int +xfs_dir2_block_getdents( + struct xfs_da_args *args, + struct dir_context *ctx) +{ + struct xfs_inode *dp = args->dp; /* incore directory inode */ + xfs_dir2_data_hdr_t *hdr; /* block header */ + struct xfs_buf *bp; /* buffer for block */ + xfs_dir2_block_tail_t *btp; /* block tail */ + xfs_dir2_data_entry_t *dep; /* block data entry */ + xfs_dir2_data_unused_t *dup; /* block unused entry */ + char *endptr; /* end of the data entries */ + int error; /* error return value */ + char *ptr; /* current data entry */ + int wantoff; /* starting block offset */ + xfs_off_t cook; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the block number in the offset is out of range, we're done. + */ + if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk) + return 0; + + error = xfs_dir3_block_read(NULL, dp, &bp); + if (error) + return error; + + /* + * Extract the byte offset we start at from the seek pointer. + * We'll skip entries before this. + */ + wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Set up values for the loop. + */ + btp = xfs_dir2_block_tail_p(geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + + /* + * Loop over the data portion of the block. + * Each object is a real entry (dep) or an unused one (dup). + */ + while (ptr < endptr) { + __uint8_t filetype; + + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * Unused, skip it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + ptr += be16_to_cpu(dup->length); + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + + /* + * Bump pointer for the next iteration. + */ + ptr += dp->d_ops->data_entsize(dep->namelen); + /* + * The entry is before the desired starting point, skip it. + */ + if ((char *)dep - (char *)hdr < wantoff) + continue; + + cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk, + (char *)dep - (char *)hdr); + + ctx->pos = cook & 0x7fffffff; + filetype = dp->d_ops->data_get_ftype(dep); + /* + * If it didn't fit, set the final offset to here & return. + */ + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) { + xfs_trans_brelse(NULL, bp); + return 0; + } + } + + /* + * Reached the end of the block. + * Set the offset to a non-existent block 1 and return. + */ + ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) & + 0x7fffffff; + xfs_trans_brelse(NULL, bp); + return 0; +} + +struct xfs_dir2_leaf_map_info { + xfs_extlen_t map_blocks; /* number of fsbs in map */ + xfs_dablk_t map_off; /* last mapped file offset */ + int map_size; /* total entries in *map */ + int map_valid; /* valid entries in *map */ + int nmap; /* mappings to ask xfs_bmapi */ + xfs_dir2_db_t curdb; /* db for current block */ + int ra_current; /* number of read-ahead blks */ + int ra_index; /* *map index for read-ahead */ + int ra_offset; /* map entry offset for ra */ + int ra_want; /* readahead count wanted */ + struct xfs_bmbt_irec map[]; /* map vector for blocks */ +}; + +STATIC int +xfs_dir2_leaf_readbuf( + struct xfs_da_args *args, + size_t bufsize, + struct xfs_dir2_leaf_map_info *mip, + xfs_dir2_off_t *curoff, + struct xfs_buf **bpp) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = *bpp; + struct xfs_bmbt_irec *map = mip->map; + struct blk_plug plug; + int error = 0; + int length; + int i; + int j; + struct xfs_da_geometry *geo = args->geo; + + /* + * If we have a buffer, we need to release it and + * take it out of the mapping. + */ + + if (bp) { + xfs_trans_brelse(NULL, bp); + bp = NULL; + mip->map_blocks -= geo->fsbcount; + /* + * Loop to get rid of the extents for the + * directory block. + */ + for (i = geo->fsbcount; i > 0; ) { + j = min_t(int, map->br_blockcount, i); + map->br_blockcount -= j; + map->br_startblock += j; + map->br_startoff += j; + /* + * If mapping is done, pitch it from + * the table. + */ + if (!map->br_blockcount && --mip->map_valid) + memmove(&map[0], &map[1], + sizeof(map[0]) * mip->map_valid); + i -= j; + } + } + + /* + * Recalculate the readahead blocks wanted. + */ + mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1; + ASSERT(mip->ra_want >= 0); + + /* + * If we don't have as many as we want, and we haven't + * run out of data blocks, get some more mappings. + */ + if (1 + mip->ra_want > mip->map_blocks && + mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) { + /* + * Get more bmaps, fill in after the ones + * we already have in the table. + */ + mip->nmap = mip->map_size - mip->map_valid; + error = xfs_bmapi_read(dp, mip->map_off, + xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) - + mip->map_off, + &map[mip->map_valid], &mip->nmap, 0); + + /* + * Don't know if we should ignore this or try to return an + * error. The trouble with returning errors is that readdir + * will just stop without actually passing the error through. + */ + if (error) + goto out; /* XXX */ + + /* + * If we got all the mappings we asked for, set the final map + * offset based on the last bmap value received. Otherwise, + * we've reached the end. + */ + if (mip->nmap == mip->map_size - mip->map_valid) { + i = mip->map_valid + mip->nmap - 1; + mip->map_off = map[i].br_startoff + map[i].br_blockcount; + } else + mip->map_off = xfs_dir2_byte_to_da(geo, + XFS_DIR2_LEAF_OFFSET); + + /* + * Look for holes in the mapping, and eliminate them. Count up + * the valid blocks. + */ + for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) { + if (map[i].br_startblock == HOLESTARTBLOCK) { + mip->nmap--; + length = mip->map_valid + mip->nmap - i; + if (length) + memmove(&map[i], &map[i + 1], + sizeof(map[i]) * length); + } else { + mip->map_blocks += map[i].br_blockcount; + i++; + } + } + mip->map_valid += mip->nmap; + } + + /* + * No valid mappings, so no more data blocks. + */ + if (!mip->map_valid) { + *curoff = xfs_dir2_da_to_byte(geo, mip->map_off); + goto out; + } + + /* + * Read the directory block starting at the first mapping. + */ + mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff); + error = xfs_dir3_data_read(NULL, dp, map->br_startoff, + map->br_blockcount >= geo->fsbcount ? + XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) : + -1, &bp); + /* + * Should just skip over the data block instead of giving up. + */ + if (error) + goto out; /* XXX */ + + /* + * Adjust the current amount of read-ahead: we just read a block that + * was previously ra. + */ + if (mip->ra_current) + mip->ra_current -= geo->fsbcount; + + /* + * Do we need more readahead? + */ + blk_start_plug(&plug); + for (mip->ra_index = mip->ra_offset = i = 0; + mip->ra_want > mip->ra_current && i < mip->map_blocks; + i += geo->fsbcount) { + ASSERT(mip->ra_index < mip->map_valid); + /* + * Read-ahead a contiguous directory block. + */ + if (i > mip->ra_current && + map[mip->ra_index].br_blockcount >= geo->fsbcount) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + mip->ra_offset, + XFS_FSB_TO_DADDR(dp->i_mount, + map[mip->ra_index].br_startblock + + mip->ra_offset)); + mip->ra_current = i; + } + + /* + * Read-ahead a non-contiguous directory block. This doesn't + * use our mapping, but this is a very rare case. + */ + else if (i > mip->ra_current) { + xfs_dir3_data_readahead(dp, + map[mip->ra_index].br_startoff + + mip->ra_offset, -1); + mip->ra_current = i; + } + + /* + * Advance offset through the mapping table. + */ + for (j = 0; j < geo->fsbcount; j += length ) { + /* + * The rest of this extent but not more than a dir + * block. + */ + length = min_t(int, geo->fsbcount, + map[mip->ra_index].br_blockcount - + mip->ra_offset); + mip->ra_offset += length; + + /* + * Advance to the next mapping if this one is used up. + */ + if (mip->ra_offset == map[mip->ra_index].br_blockcount) { + mip->ra_offset = 0; + mip->ra_index++; + } + } + } + blk_finish_plug(&plug); + +out: + *bpp = bp; + return error; +} + +/* + * Getdents (readdir) for leaf and node directories. + * This reads the data blocks only, so is the same for both forms. + */ +STATIC int +xfs_dir2_leaf_getdents( + struct xfs_da_args *args, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_inode *dp = args->dp; + struct xfs_buf *bp = NULL; /* data block buffer */ + xfs_dir2_data_hdr_t *hdr; /* data block header */ + xfs_dir2_data_entry_t *dep; /* data entry */ + xfs_dir2_data_unused_t *dup; /* unused entry */ + int error = 0; /* error return value */ + int length; /* temporary length value */ + int byteoff; /* offset in current block */ + xfs_dir2_off_t curoff; /* current overall offset */ + xfs_dir2_off_t newoff; /* new curoff after new blk */ + char *ptr = NULL; /* pointer to current data */ + struct xfs_dir2_leaf_map_info *map_info; + struct xfs_da_geometry *geo = args->geo; + + /* + * If the offset is at or past the largest allowed value, + * give up right away. + */ + if (ctx->pos >= XFS_DIR2_MAX_DATAPTR) + return 0; + + /* + * Set up to bmap a number of blocks based on the caller's + * buffer size, the directory block size, and the filesystem + * block size. + */ + length = howmany(bufsize + geo->blksize, (1 << geo->fsblog)); + map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) + + (length * sizeof(struct xfs_bmbt_irec)), + KM_SLEEP | KM_NOFS); + map_info->map_size = length; + + /* + * Inside the loop we keep the main offset value as a byte offset + * in the directory file. + */ + curoff = xfs_dir2_dataptr_to_byte(ctx->pos); + + /* + * Force this conversion through db so we truncate the offset + * down to get the start of the data block. + */ + map_info->map_off = xfs_dir2_db_to_da(geo, + xfs_dir2_byte_to_db(geo, curoff)); + + /* + * Loop over directory entries until we reach the end offset. + * Get more blocks and readahead as necessary. + */ + while (curoff < XFS_DIR2_LEAF_OFFSET) { + __uint8_t filetype; + + /* + * If we have no buffer, or we're off the end of the + * current buffer, need to get another one. + */ + if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) { + + error = xfs_dir2_leaf_readbuf(args, bufsize, map_info, + &curoff, &bp); + if (error || !map_info->map_valid) + break; + + /* + * Having done a read, we need to set a new offset. + */ + newoff = xfs_dir2_db_off_to_byte(geo, + map_info->curdb, 0); + /* + * Start of the current block. + */ + if (curoff < newoff) + curoff = newoff; + /* + * Make sure we're in the right block. + */ + else if (curoff > newoff) + ASSERT(xfs_dir2_byte_to_db(geo, curoff) == + map_info->curdb); + hdr = bp->b_addr; + xfs_dir3_data_check(dp, bp); + /* + * Find our position in the block. + */ + ptr = (char *)dp->d_ops->data_entry_p(hdr); + byteoff = xfs_dir2_byte_to_off(geo, curoff); + /* + * Skip past the header. + */ + if (byteoff == 0) + curoff += dp->d_ops->data_entry_offset; + /* + * Skip past entries until we reach our offset. + */ + else { + while ((char *)ptr - (char *)hdr < byteoff) { + dup = (xfs_dir2_data_unused_t *)ptr; + + if (be16_to_cpu(dup->freetag) + == XFS_DIR2_DATA_FREE_TAG) { + + length = be16_to_cpu(dup->length); + ptr += length; + continue; + } + dep = (xfs_dir2_data_entry_t *)ptr; + length = + dp->d_ops->data_entsize(dep->namelen); + ptr += length; + } + /* + * Now set our real offset. + */ + curoff = + xfs_dir2_db_off_to_byte(geo, + xfs_dir2_byte_to_db(geo, curoff), + (char *)ptr - (char *)hdr); + if (ptr >= (char *)hdr + geo->blksize) { + continue; + } + } + } + /* + * We have a pointer to an entry. + * Is it a live one? + */ + dup = (xfs_dir2_data_unused_t *)ptr; + /* + * No, it's unused, skip over it. + */ + if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { + length = be16_to_cpu(dup->length); + ptr += length; + curoff += length; + continue; + } + + dep = (xfs_dir2_data_entry_t *)ptr; + length = dp->d_ops->data_entsize(dep->namelen); + filetype = dp->d_ops->data_get_ftype(dep); + + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, + be64_to_cpu(dep->inumber), + xfs_dir3_get_dtype(dp->i_mount, filetype))) + break; + + /* + * Advance to next entry in the block. + */ + ptr += length; + curoff += length; + /* bufsize may have just been a guess; don't go negative */ + bufsize = bufsize > length ? bufsize - length : 0; + } + + /* + * All done. Set output offset value to current offset. + */ + if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR)) + ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff; + else + ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff; + kmem_free(map_info); + if (bp) + xfs_trans_brelse(NULL, bp); + return error; +} + +/* + * Read a directory. + */ +int +xfs_readdir( + struct xfs_inode *dp, + struct dir_context *ctx, + size_t bufsize) +{ + struct xfs_da_args args = { NULL }; + int rval; + int v; + uint lock_mode; + + trace_xfs_readdir(dp); + + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); + + ASSERT(S_ISDIR(dp->i_d.di_mode)); + XFS_STATS_INC(xs_dir_getdents); + + args.dp = dp; + args.geo = dp->i_mount->m_dir_geo; + + lock_mode = xfs_ilock_data_map_shared(dp); + if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) + rval = xfs_dir2_sf_getdents(&args, ctx); + else if ((rval = xfs_dir2_isblock(&args, &v))) + ; + else if (v) + rval = xfs_dir2_block_getdents(&args, ctx); + else + rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); + xfs_iunlock(dp, lock_mode); + + return rval; +} diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c index 0cd77b17bf9..53c3be619db 100644 --- a/fs/xfs/xfs_dir2_sf.c +++ b/fs/xfs/xfs_dir2_sf.c @@ -17,26 +17,22 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_da_format.h" #include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" #include "xfs_error.h" -#include "xfs_dir2_data.h" -#include "xfs_dir2_leaf.h" -#include "xfs_dir2_block.h" -#include "xfs_dir2_trace.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_trace.h" +#include "xfs_dinode.h" /* * Prototypes for internal functions. @@ -69,7 +65,7 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args); int /* size for sf form */ xfs_dir2_block_sfsize( xfs_inode_t *dp, /* incore inode pointer */ - xfs_dir2_block_t *block, /* block directory data */ + xfs_dir2_data_hdr_t *hdr, /* block directory data */ xfs_dir2_sf_hdr_t *sfhp) /* output: header for sf form */ { xfs_dir2_dataptr_t addr; /* data entry address */ @@ -85,12 +81,21 @@ xfs_dir2_block_sfsize( int namelen; /* total name bytes */ xfs_ino_t parent = 0; /* parent inode number */ int size=0; /* total computed size */ + int has_ftype; + struct xfs_da_geometry *geo; mp = dp->i_mount; + geo = mp->m_dir_geo; + + /* + * if there is a filetype field, add the extra byte to the namelen + * for each entry that we see. + */ + has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; count = i8count = namelen = 0; - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - blp = XFS_DIR2_BLOCK_LEAF_P(btp); + btp = xfs_dir2_block_tail_p(geo, hdr); + blp = xfs_dir2_block_leaf_p(btp); /* * Iterate over the block's data entries by using the leaf pointers. @@ -101,8 +106,8 @@ xfs_dir2_block_sfsize( /* * Calculate the pointer to the entry at hand. */ - dep = (xfs_dir2_data_entry_t *) - ((char *)block + XFS_DIR2_DATAPTR_TO_OFF(mp, addr)); + dep = (xfs_dir2_data_entry_t *)((char *)hdr + + xfs_dir2_dataptr_to_off(geo, addr)); /* * Detect . and .., so we can special-case them. * . is not included in sf directories. @@ -116,15 +121,16 @@ xfs_dir2_block_sfsize( if (!isdot) i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; #endif + /* take into account the file type field */ if (!isdot && !isdotdot) { count++; - namelen += dep->namelen; + namelen += dep->namelen + has_ftype; } else if (isdotdot) parent = be64_to_cpu(dep->inumber); /* * Calculate the new size, see if we should give up yet. */ - size = XFS_DIR2_SF_HDR_SIZE(i8count) + /* header */ + size = xfs_dir2_sf_hdr_size(i8count) + /* header */ count + /* namelen */ count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ namelen + /* name */ @@ -139,7 +145,7 @@ xfs_dir2_block_sfsize( */ sfhp->count = count; sfhp->i8count = i8count; - XFS_DIR2_SF_PUT_INUMBER((xfs_dir2_sf_t *)sfhp, &parent, &sfhp->parent); + dp->d_ops->sf_put_parent_ino(sfhp, parent); return size; } @@ -150,11 +156,11 @@ xfs_dir2_block_sfsize( int /* error */ xfs_dir2_block_to_sf( xfs_da_args_t *args, /* operation arguments */ - xfs_dabuf_t *bp, /* block buffer */ + struct xfs_buf *bp, int size, /* shortform directory size */ xfs_dir2_sf_hdr_t *sfhp) /* shortform directory hdr */ { - xfs_dir2_block_t *block; /* block structure */ + xfs_dir2_data_hdr_t *hdr; /* block header */ xfs_dir2_block_tail_t *btp; /* block tail pointer */ xfs_dir2_data_entry_t *dep; /* data entry pointer */ xfs_inode_t *dp; /* incore directory inode */ @@ -165,49 +171,36 @@ xfs_dir2_block_to_sf( xfs_mount_t *mp; /* filesystem mount point */ char *ptr; /* current data pointer */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ - xfs_ino_t temp; + xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */ + xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */ + + trace_xfs_dir2_block_to_sf(args); - xfs_dir2_trace_args_sb("block_to_sf", args, size, bp); dp = args->dp; mp = dp->i_mount; /* - * Make a copy of the block data, so we can shrink the inode - * and add local data. + * allocate a temporary destination buffer the size of the inode + * to format the data into. Once we have formatted the data, we + * can free the block and copy the formatted data into the inode literal + * area. */ - block = kmem_alloc(mp->m_dirblksize, KM_SLEEP); - memcpy(block, bp->data, mp->m_dirblksize); - logflags = XFS_ILOG_CORE; - if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) { - ASSERT(error != ENOSPC); - goto out; - } - /* - * The buffer is now unconditionally gone, whether - * xfs_dir2_shrink_inode worked or not. - * - * Convert the inode to local format. - */ - dp->i_df.if_flags &= ~XFS_IFEXTENTS; - dp->i_df.if_flags |= XFS_IFINLINE; - dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; - ASSERT(dp->i_df.if_bytes == 0); - xfs_idata_realloc(dp, size, XFS_DATA_FORK); - logflags |= XFS_ILOG_DDATA; + dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP); + hdr = bp->b_addr; + /* * Copy the header into the newly allocate local space. */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - memcpy(sfp, sfhp, XFS_DIR2_SF_HDR_SIZE(sfhp->i8count)); - dp->i_d.di_size = size; + sfp = (xfs_dir2_sf_hdr_t *)dst; + memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count)); + /* * Set up to loop over the block's entries. */ - btp = XFS_DIR2_BLOCK_TAIL_P(mp, block); - ptr = (char *)block->u; - endptr = (char *)XFS_DIR2_BLOCK_LEAF_P(btp); - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + btp = xfs_dir2_block_tail_p(args->geo, hdr); + ptr = (char *)dp->d_ops->data_entry_p(hdr); + endptr = (char *)xfs_dir2_block_leaf_p(btp); + sfep = xfs_dir2_sf_firstentry(sfp); /* * Loop over the active and unused entries. * Stop when we reach the leaf/tail portion of the block. @@ -233,28 +226,54 @@ xfs_dir2_block_to_sf( else if (dep->namelen == 2 && dep->name[0] == '.' && dep->name[1] == '.') ASSERT(be64_to_cpu(dep->inumber) == - XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent)); + dp->d_ops->sf_get_parent_ino(sfp)); /* * Normal entry, copy it into shortform. */ else { sfep->namelen = dep->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, + xfs_dir2_sf_put_offset(sfep, (xfs_dir2_data_aoff_t) - ((char *)dep - (char *)block)); + ((char *)dep - (char *)hdr)); memcpy(sfep->name, dep->name, dep->namelen); - temp = be64_to_cpu(dep->inumber); - XFS_DIR2_SF_PUT_INUMBER(sfp, &temp, - XFS_DIR2_SF_INUMBERP(sfep)); - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + dp->d_ops->sf_put_ino(sfp, sfep, + be64_to_cpu(dep->inumber)); + dp->d_ops->sf_put_ftype(sfep, + dp->d_ops->data_get_ftype(dep)); + + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } - ptr += XFS_DIR2_DATA_ENTSIZE(dep->namelen); + ptr += dp->d_ops->data_entsize(dep->namelen); } ASSERT((char *)sfep - (char *)sfp == size); + + /* now we are done with the block, we can shrink the inode */ + logflags = XFS_ILOG_CORE; + error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); + if (error) { + ASSERT(error != ENOSPC); + goto out; + } + + /* + * The buffer is now unconditionally gone, whether + * xfs_dir2_shrink_inode worked or not. + * + * Convert the inode to local format and copy the data in. + */ + dp->i_df.if_flags &= ~XFS_IFEXTENTS; + dp->i_df.if_flags |= XFS_IFINLINE; + dp->i_d.di_format = XFS_DINODE_FMT_LOCAL; + ASSERT(dp->i_df.if_bytes == 0); + xfs_idata_realloc(dp, size, XFS_DATA_FORK); + + logflags |= XFS_ILOG_DDATA; + memcpy(dp->i_df.if_u1.if_data, dst, size); + dp->i_d.di_size = size; xfs_dir2_sf_check(args); out: xfs_trans_log_inode(args->trans, dp, logflags); - kmem_free(block, mp->m_dirblksize); + kmem_free(dst); return error; } @@ -268,19 +287,18 @@ int /* error */ xfs_dir2_sf_addname( xfs_da_args_t *args) /* operation arguments */ { - int add_entsize; /* size of the new entry */ xfs_inode_t *dp; /* incore directory inode */ int error; /* error return value */ int incr_isize; /* total change in size */ int new_isize; /* di_size after adding name */ int objchange; /* changing to 8-byte inodes */ xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */ - int old_isize; /* di_size before adding name */ int pick; /* which algorithm to use */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */ - xfs_dir2_trace_args("sf_addname", args); + trace_xfs_dir2_sf_addname(args); + ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); @@ -293,34 +311,29 @@ xfs_dir2_sf_addname( } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Compute entry (and change in) size. */ - add_entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); - incr_isize = add_entsize; + incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); objchange = 0; #if XFS_BIG_INUMS /* * Do we have to change to 8 byte inodes? */ - if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { /* - * Yes, adjust the entry size and the total size. + * Yes, adjust the inode size. old count + (parent + new) */ - add_entsize += - (uint)sizeof(xfs_dir2_ino8_t) - - (uint)sizeof(xfs_dir2_ino4_t); incr_isize += - (sfp->hdr.count + 2) * + (sfp->count + 2) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); objchange = 1; } #endif - old_isize = (int)dp->i_d.di_size; - new_isize = old_isize + incr_isize; + new_isize = (int)dp->i_d.di_size + incr_isize; /* * Won't fit as shortform any more (due to size), * or the pick routine says it won't (due to offset values). @@ -331,7 +344,7 @@ xfs_dir2_sf_addname( /* * Just checking or no space reservation, it doesn't fit. */ - if (args->justcheck || args->total == 0) + if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) return XFS_ERROR(ENOSPC); /* * Convert to block form then add the name. @@ -344,7 +357,7 @@ xfs_dir2_sf_addname( /* * Just checking, it fits. */ - if (args->justcheck) + if (args->op_flags & XFS_DA_OP_JUSTCHECK) return 0; /* * Do it the easy way - just add it at the end. @@ -383,37 +396,38 @@ xfs_dir2_sf_addname_easy( { int byteoff; /* byte offset in sf dir */ xfs_inode_t *dp; /* incore directory inode */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ dp = args->dp; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; byteoff = (int)((char *)sfep - (char *)sfp); /* * Grow the in-inode space. */ - xfs_idata_realloc(dp, XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen), - XFS_DATA_FORK); + xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen), + XFS_DATA_FORK); /* * Need to set up again due to realloc of the inode data. */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff); /* * Fill in the new entry. */ sfep->namelen = args->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + /* * Update the header and inode. */ - sfp->hdr.count++; + sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) - sfp->hdr.i8count++; + sfp->i8count++; #endif dp->i_d.di_size = new_isize; xfs_dir2_sf_check(args); @@ -443,34 +457,36 @@ xfs_dir2_sf_addname_hard( xfs_dir2_data_aoff_t offset; /* current offset value */ int old_isize; /* previous di_size */ xfs_dir2_sf_entry_t *oldsfep; /* entry in original dir */ - xfs_dir2_sf_t *oldsfp; /* original shortform dir */ + xfs_dir2_sf_hdr_t *oldsfp; /* original shortform dir */ xfs_dir2_sf_entry_t *sfep; /* entry in new dir */ - xfs_dir2_sf_t *sfp; /* new shortform dir */ + xfs_dir2_sf_hdr_t *sfp; /* new shortform dir */ + struct xfs_mount *mp; /* * Copy the old directory to the stack buffer. */ dp = args->dp; + mp = dp->i_mount; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; old_isize = (int)dp->i_d.di_size; buf = kmem_alloc(old_isize, KM_SLEEP); - oldsfp = (xfs_dir2_sf_t *)buf; + oldsfp = (xfs_dir2_sf_hdr_t *)buf; memcpy(oldsfp, sfp, old_isize); /* * Loop over the old directory finding the place we're going * to insert the new entry. * If it's going to end up at the end then oldsfep will point there. */ - for (offset = XFS_DIR2_DATA_FIRST_OFFSET, - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp), - add_datasize = XFS_DIR2_DATA_ENTSIZE(args->namelen), + for (offset = dp->d_ops->data_first_offset, + oldsfep = xfs_dir2_sf_firstentry(oldsfp), + add_datasize = dp->d_ops->data_entsize(args->namelen), eof = (char *)oldsfep == &buf[old_isize]; !eof; - offset = new_offset + XFS_DIR2_DATA_ENTSIZE(oldsfep->namelen), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep), + offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep), eof = (char *)oldsfep == &buf[old_isize]) { - new_offset = XFS_DIR2_SF_GET_OFFSET(oldsfep); + new_offset = xfs_dir2_sf_get_offset(oldsfep); if (offset + add_datasize <= new_offset) break; } @@ -484,7 +500,7 @@ xfs_dir2_sf_addname_hard( /* * Reset the pointer since the buffer was reallocated. */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; /* * Copy the first part of the directory, including the header. */ @@ -495,23 +511,23 @@ xfs_dir2_sf_addname_hard( * Fill in the new entry, and update the header counts. */ sfep->namelen = args->namelen; - XFS_DIR2_SF_PUT_OFFSET(sfep, offset); + xfs_dir2_sf_put_offset(sfep, offset); memcpy(sfep->name, args->name, sfep->namelen); - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); - sfp->hdr.count++; + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); + sfp->count++; #if XFS_BIG_INUMS if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) - sfp->hdr.i8count++; + sfp->i8count++; #endif /* * If there's more left to copy, do that. */ if (!eof) { - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); memcpy(sfep, oldsfep, old_isize - nbytes); } - kmem_free(buf, old_isize); + kmem_free(buf); dp->i_d.di_size = new_isize; xfs_dir2_sf_check(args); } @@ -536,43 +552,43 @@ xfs_dir2_sf_addname_pick( xfs_mount_t *mp; /* filesystem mount point */ xfs_dir2_data_aoff_t offset; /* data block offset */ xfs_dir2_sf_entry_t *sfep; /* shortform entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ int size; /* entry's data size */ int used; /* data bytes used */ dp = args->dp; mp = dp->i_mount; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - size = XFS_DIR2_DATA_ENTSIZE(args->namelen); - offset = XFS_DIR2_DATA_FIRST_OFFSET; - sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + size = dp->d_ops->data_entsize(args->namelen); + offset = dp->d_ops->data_first_offset; + sfep = xfs_dir2_sf_firstentry(sfp); holefit = 0; /* * Loop over sf entries. * Keep track of data offset and whether we've seen a place * to insert the new entry. */ - for (i = 0; i < sfp->hdr.count; i++) { + for (i = 0; i < sfp->count; i++) { if (!holefit) - holefit = offset + size <= XFS_DIR2_SF_GET_OFFSET(sfep); - offset = XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(sfep->namelen); - sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep); + holefit = offset + size <= xfs_dir2_sf_get_offset(sfep); + offset = xfs_dir2_sf_get_offset(sfep) + + dp->d_ops->data_entsize(sfep->namelen); + sfep = dp->d_ops->sf_nextentry(sfp, sfep); } /* * Calculate data bytes used excluding the new entry, if this * was a data block (block form directory). */ used = offset + - (sfp->hdr.count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (sfp->count + 3) * (uint)sizeof(xfs_dir2_leaf_entry_t) + (uint)sizeof(xfs_dir2_block_tail_t); /* * If it won't fit in a block form then we can't insert it, * we'll go back, convert to block, then try the insert and convert * to leaf. */ - if (used + (holefit ? 0 : size) > mp->m_dirblksize) + if (used + (holefit ? 0 : size) > args->geo->blksize) return 0; /* * If changing the inode number size, do it the hard way. @@ -587,7 +603,7 @@ xfs_dir2_sf_addname_pick( /* * If it won't fit at the end then do it the hard way (use the hole). */ - if (used + size > mp->m_dirblksize) + if (used + size > args->geo->blksize) return 2; /* * Do it the easy way. @@ -611,32 +627,34 @@ xfs_dir2_sf_check( xfs_ino_t ino; /* entry inode number */ int offset; /* data offset */ xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + struct xfs_mount *mp; dp = args->dp; + mp = dp->i_mount; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - offset = XFS_DIR2_DATA_FIRST_OFFSET; - ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + offset = dp->d_ops->data_first_offset; + ino = dp->d_ops->sf_get_parent_ino(sfp); i8count = ino > XFS_DIR2_MAX_SHORT_INUM; - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - ASSERT(XFS_DIR2_SF_GET_OFFSET(sfep) >= offset); - ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset); + ino = dp->d_ops->sf_get_ino(sfp, sfep); i8count += ino > XFS_DIR2_MAX_SHORT_INUM; offset = - XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(sfep->namelen); + xfs_dir2_sf_get_offset(sfep) + + dp->d_ops->data_entsize(sfep->namelen); + ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); } - ASSERT(i8count == sfp->hdr.i8count); + ASSERT(i8count == sfp->i8count); ASSERT(XFS_BIG_INUMS || i8count == 0); ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); ASSERT(offset + - (sfp->hdr.count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + - (uint)sizeof(xfs_dir2_block_tail_t) <= - dp->i_mount->m_dirblksize); + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + + (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize); } #endif /* DEBUG */ @@ -650,10 +668,11 @@ xfs_dir2_sf_create( { xfs_inode_t *dp; /* incore directory inode */ int i8count; /* parent inode is an 8-byte number */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ int size; /* directory size */ - xfs_dir2_trace_args_i("sf_create", args, pino); + trace_xfs_dir2_sf_create(args); + dp = args->dp; ASSERT(dp != NULL); @@ -671,7 +690,7 @@ xfs_dir2_sf_create( ASSERT(dp->i_df.if_flags & XFS_IFINLINE); ASSERT(dp->i_df.if_bytes == 0); i8count = pino > XFS_DIR2_MAX_SHORT_INUM; - size = XFS_DIR2_SF_HDR_SIZE(i8count); + size = xfs_dir2_sf_hdr_size(i8count); /* * Make a buffer for the data. */ @@ -679,164 +698,19 @@ xfs_dir2_sf_create( /* * Fill in the header, */ - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - sfp->hdr.i8count = i8count; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + sfp->i8count = i8count; /* * Now can put in the inode number, since i8count is set. */ - XFS_DIR2_SF_PUT_INUMBER(sfp, &pino, &sfp->hdr.parent); - sfp->hdr.count = 0; + dp->d_ops->sf_put_parent_ino(sfp, pino); + sfp->count = 0; dp->i_d.di_size = size; xfs_dir2_sf_check(args); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); return 0; } -int /* error */ -xfs_dir2_sf_getdents( - xfs_inode_t *dp, /* incore directory inode */ - uio_t *uio, /* caller's buffer control */ - int *eofp, /* eof reached? (out) */ - xfs_dirent_t *dbp, /* caller's buffer */ - xfs_dir2_put_t put) /* abi's formatting function */ -{ - int error; /* error return value */ - int i; /* shortform entry number */ - xfs_mount_t *mp; /* filesystem mount point */ - xfs_dir2_dataptr_t off; /* current entry's offset */ - xfs_dir2_put_args_t p; /* arg package for put rtn */ - xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ - xfs_off_t dir_offset; - - mp = dp->i_mount; - - ASSERT(dp->i_df.if_flags & XFS_IFINLINE); - /* - * Give up if the directory is way too short. - */ - if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - return XFS_ERROR(EIO); - } - - dir_offset = uio->uio_offset; - - ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); - ASSERT(dp->i_df.if_u1.if_data != NULL); - - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); - - /* - * If the block number in the offset is out of range, we're done. - */ - if (XFS_DIR2_DATAPTR_TO_DB(mp, dir_offset) > mp->m_dirdatablk) { - *eofp = 1; - return 0; - } - - /* - * Set up putargs structure. - */ - p.dbp = dbp; - p.put = put; - p.uio = uio; - /* - * Put . entry unless we're starting past it. - */ - if (dir_offset <= - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET)) { - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, 0, - XFS_DIR2_DATA_DOTDOT_OFFSET); - p.ino = dp->i_ino; -#if XFS_BIG_INUMS - p.ino += mp->m_inoadd; -#endif - p.name = "."; - p.namelen = 1; - - error = p.put(&p); - - if (!p.done) { - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOT_OFFSET); - return error; - } - } - - /* - * Put .. entry unless we're starting past it. - */ - if (dir_offset <= - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET)) { - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_FIRST_OFFSET); - p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); -#if XFS_BIG_INUMS - p.ino += mp->m_inoadd; -#endif - p.name = ".."; - p.namelen = 2; - - error = p.put(&p); - - if (!p.done) { - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_DATA_DOTDOT_OFFSET); - return error; - } - } - - /* - * Loop while there are more entries and put'ing works. - */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - - off = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_SF_GET_OFFSET(sfep)); - - if (dir_offset > off) - continue; - - p.namelen = sfep->namelen; - - p.cook = XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk, - XFS_DIR2_SF_GET_OFFSET(sfep) + - XFS_DIR2_DATA_ENTSIZE(p.namelen)); - - p.ino = XFS_DIR2_SF_GET_INUMBER(sfp, XFS_DIR2_SF_INUMBERP(sfep)); -#if XFS_BIG_INUMS - p.ino += mp->m_inoadd; -#endif - p.name = (char *)sfep->name; - - error = p.put(&p); - - if (!p.done) { - uio->uio_offset = off; - return error; - } - } - - /* - * They all fit. - */ - *eofp = 1; - - uio->uio_offset = - XFS_DIR2_DB_OFF_TO_DATAPTR(mp, mp->m_dirdatablk + 1, 0); - - return 0; -} - /* * Lookup an entry in a shortform directory. * Returns EEXIST if found, ENOENT if not found. @@ -847,10 +721,14 @@ xfs_dir2_sf_lookup( { xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ + int error; xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + enum xfs_dacmp cmp; /* comparison result */ + xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */ + + trace_xfs_dir2_sf_lookup(args); - xfs_dir2_trace_args("sf_lookup", args); xfs_dir2_sf_check(args); dp = args->dp; @@ -864,13 +742,15 @@ xfs_dir2_sf_lookup( } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Special case for . */ if (args->namelen == 1 && args->name[0] == '.') { args->inumber = dp->i_ino; + args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); } /* @@ -878,29 +758,43 @@ xfs_dir2_sf_lookup( */ if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { - args->inumber = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + args->inumber = dp->d_ops->sf_get_parent_ino(sfp); + args->cmpresult = XFS_CMP_EXACT; + args->filetype = XFS_DIR3_FT_DIR; return XFS_ERROR(EEXIST); } /* * Loop over all the entries trying to match ours. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(args->name, sfep->name, args->namelen) == 0) { - args->inumber = - XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)); - return XFS_ERROR(EEXIST); + ci_sfep = NULL; + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + /* + * Compare name and if it's an exact match, return the inode + * number. If it's the first case-insensitive match, store the + * inode number and continue looking for an exact match. + */ + cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name, + sfep->namelen); + if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) { + args->cmpresult = cmp; + args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); + args->filetype = dp->d_ops->sf_get_ftype(sfep); + if (cmp == XFS_CMP_EXACT) + return XFS_ERROR(EEXIST); + ci_sfep = sfep; } } + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); /* - * Didn't find it. + * Here, we can only be doing a lookup (not a rename or replace). + * If a case-insensitive match was not found, return ENOENT. */ - ASSERT(args->oknoent); - return XFS_ERROR(ENOENT); + if (!ci_sfep) + return XFS_ERROR(ENOENT); + /* otherwise process the CI match as required by the caller */ + error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); + return XFS_ERROR(error); } /* @@ -917,9 +811,10 @@ xfs_dir2_sf_removename( int newsize; /* new inode size */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_removename(args); - xfs_dir2_trace_args("sf_removename", args); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); @@ -933,35 +828,31 @@ xfs_dir2_sf_removename( } ASSERT(dp->i_df.if_bytes == oldsize); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsize >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count)); /* * Loop over the old directory entries. * Find the one we're deleting. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(sfep->name, args->name, args->namelen) == 0) { - ASSERT(XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)) == - args->inumber); + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { + ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) == + args->inumber); break; } } /* * Didn't find it. */ - if (i == sfp->hdr.count) { + if (i == sfp->count) return XFS_ERROR(ENOENT); - } /* * Calculate sizes. */ byteoff = (int)((char *)sfep - (char *)sfp); - entsize = XFS_DIR2_SF_ENTSIZE_BYNAME(sfp, args->namelen); + entsize = dp->d_ops->sf_entsize(sfp, args->namelen); newsize = oldsize - entsize; /* * Copy the part if any after the removed entry, sliding it down. @@ -972,22 +863,22 @@ xfs_dir2_sf_removename( /* * Fix up the header and file size. */ - sfp->hdr.count--; + sfp->count--; dp->i_d.di_size = newsize; /* * Reallocate, making it smaller. */ xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; #if XFS_BIG_INUMS /* * Are we changing inode number size? */ if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) { - if (sfp->hdr.i8count == 1) + if (sfp->i8count == 1) xfs_dir2_sf_toino4(args); else - sfp->hdr.i8count--; + sfp->i8count--; } #endif xfs_dir2_sf_check(args); @@ -1011,9 +902,10 @@ xfs_dir2_sf_replace( int i8elevated; /* sf_toino8 set i8count=1 */ #endif xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ - xfs_dir2_sf_t *sfp; /* shortform structure */ + xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ + + trace_xfs_dir2_sf_replace(args); - xfs_dir2_trace_args("sf_replace", args); dp = args->dp; ASSERT(dp->i_df.if_flags & XFS_IFINLINE); @@ -1026,19 +918,19 @@ xfs_dir2_sf_replace( } ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); ASSERT(dp->i_df.if_u1.if_data != NULL); - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(dp->i_d.di_size >= XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count)); + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); #if XFS_BIG_INUMS /* * New inode number is large, and need to convert to 8-byte inodes. */ - if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->hdr.i8count == 0) { + if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) { int error; /* error return value */ int newsize; /* new inode size */ newsize = dp->i_df.if_bytes + - (sfp->hdr.count + 1) * + (sfp->count + 1) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); /* @@ -1056,7 +948,7 @@ xfs_dir2_sf_replace( */ xfs_dir2_sf_toino8(args); i8elevated = 1; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; } else i8elevated = 0; #endif @@ -1067,36 +959,33 @@ xfs_dir2_sf_replace( if (args->namelen == 2 && args->name[0] == '.' && args->name[1] == '.') { #if XFS_BIG_INUMS || defined(DEBUG) - ino = XFS_DIR2_SF_GET_INUMBER(sfp, &sfp->hdr.parent); + ino = dp->d_ops->sf_get_parent_ino(sfp); ASSERT(args->inumber != ino); #endif - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, &sfp->hdr.parent); + dp->d_ops->sf_put_parent_ino(sfp, args->inumber); } /* * Normal entry, look for the name. */ else { - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep)) { - if (sfep->namelen == args->namelen && - sfep->name[0] == args->name[0] && - memcmp(args->name, sfep->name, args->namelen) == 0) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { + if (xfs_da_compname(args, sfep->name, sfep->namelen) == + XFS_CMP_EXACT) { #if XFS_BIG_INUMS || defined(DEBUG) - ino = XFS_DIR2_SF_GET_INUMBER(sfp, - XFS_DIR2_SF_INUMBERP(sfep)); + ino = dp->d_ops->sf_get_ino(sfp, sfep); ASSERT(args->inumber != ino); #endif - XFS_DIR2_SF_PUT_INUMBER(sfp, &args->inumber, - XFS_DIR2_SF_INUMBERP(sfep)); + dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); + dp->d_ops->sf_put_ftype(sfep, args->filetype); break; } } /* * Didn't find it. */ - if (i == sfp->hdr.count) { - ASSERT(args->oknoent); + if (i == sfp->count) { + ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); #if XFS_BIG_INUMS if (i8elevated) xfs_dir2_sf_toino4(args); @@ -1113,10 +1002,10 @@ xfs_dir2_sf_replace( /* * And the old count was one, so need to convert to small. */ - if (sfp->hdr.i8count == 1) + if (sfp->i8count == 1) xfs_dir2_sf_toino4(args); else - sfp->hdr.i8count--; + sfp->i8count--; } /* * See if the old number was small, the new number is large. @@ -1127,9 +1016,9 @@ xfs_dir2_sf_replace( * add to the i8count unless we just converted to 8-byte * inodes (which does an implied i8count = 1) */ - ASSERT(sfp->hdr.i8count != 0); + ASSERT(sfp->i8count != 0); if (!i8elevated) - sfp->hdr.i8count++; + sfp->i8count++; } #endif xfs_dir2_sf_check(args); @@ -1149,16 +1038,18 @@ xfs_dir2_sf_toino4( char *buf; /* old dir's buffer */ xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ - xfs_ino_t ino; /* entry inode number */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_t *oldsfp; /* old sf directory */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ - xfs_dir2_sf_t *sfp; /* new sf directory */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + struct xfs_mount *mp; + + trace_xfs_dir2_sf_toino4(args); - xfs_dir2_trace_args("sf_toino4", args); dp = args->dp; + mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1167,57 +1058,56 @@ xfs_dir2_sf_toino4( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, KM_SLEEP); - oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsfp->hdr.i8count == 1); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 1); memcpy(buf, oldsfp, oldsize); /* * Compute the new inode size. */ newsize = oldsize - - (oldsfp->hdr.count + 1) * + (oldsfp->count + 1) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* * Reset our pointers, the data has moved. */ - oldsfp = (xfs_dir2_sf_t *)buf; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; /* * Fill in the new header. */ - sfp->hdr.count = oldsfp->hdr.count; - sfp->hdr.i8count = 0; - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + sfp->count = oldsfp->count; + sfp->i8count = 0; + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, - XFS_DIR2_SF_INUMBERP(oldsfep)); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); } /* * Clean up the inode. */ - kmem_free(buf, oldsize); + kmem_free(buf); dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } /* - * Convert from 4-byte inode numbers to 8-byte inode numbers. - * The new 8-byte inode number is not there yet, we leave with the - * count 1 but no corresponding entry. + * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers. + * The new entry w/ an 8-byte inode number is not there yet; we leave with + * i8count set to 1, but no corresponding 8-byte entry. */ static void xfs_dir2_sf_toino8( @@ -1226,16 +1116,18 @@ xfs_dir2_sf_toino8( char *buf; /* old dir's buffer */ xfs_inode_t *dp; /* incore directory inode */ int i; /* entry index */ - xfs_ino_t ino; /* entry inode number */ int newsize; /* new inode size */ xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */ - xfs_dir2_sf_t *oldsfp; /* old sf directory */ + xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */ int oldsize; /* old inode size */ xfs_dir2_sf_entry_t *sfep; /* new sf entry */ - xfs_dir2_sf_t *sfp; /* new sf directory */ + xfs_dir2_sf_hdr_t *sfp; /* new sf directory */ + struct xfs_mount *mp; + + trace_xfs_dir2_sf_toino8(args); - xfs_dir2_trace_args("sf_toino8", args); dp = args->dp; + mp = dp->i_mount; /* * Copy the old directory to the buffer. @@ -1244,49 +1136,48 @@ xfs_dir2_sf_toino8( */ oldsize = dp->i_df.if_bytes; buf = kmem_alloc(oldsize, KM_SLEEP); - oldsfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; - ASSERT(oldsfp->hdr.i8count == 0); + oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + ASSERT(oldsfp->i8count == 0); memcpy(buf, oldsfp, oldsize); /* - * Compute the new inode size. + * Compute the new inode size (nb: entry count + 1 for parent) */ newsize = oldsize + - (oldsfp->hdr.count + 1) * + (oldsfp->count + 1) * ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t)); xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); /* * Reset our pointers, the data has moved. */ - oldsfp = (xfs_dir2_sf_t *)buf; - sfp = (xfs_dir2_sf_t *)dp->i_df.if_u1.if_data; + oldsfp = (xfs_dir2_sf_hdr_t *)buf; + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; /* * Fill in the new header. */ - sfp->hdr.count = oldsfp->hdr.count; - sfp->hdr.i8count = 1; - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, &oldsfp->hdr.parent); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, &sfp->hdr.parent); + sfp->count = oldsfp->count; + sfp->i8count = 1; + dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp)); /* * Copy the entries field by field. */ - for (i = 0, sfep = XFS_DIR2_SF_FIRSTENTRY(sfp), - oldsfep = XFS_DIR2_SF_FIRSTENTRY(oldsfp); - i < sfp->hdr.count; - i++, sfep = XFS_DIR2_SF_NEXTENTRY(sfp, sfep), - oldsfep = XFS_DIR2_SF_NEXTENTRY(oldsfp, oldsfep)) { + for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp), + oldsfep = xfs_dir2_sf_firstentry(oldsfp); + i < sfp->count; + i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), + oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { sfep->namelen = oldsfep->namelen; sfep->offset = oldsfep->offset; memcpy(sfep->name, oldsfep->name, sfep->namelen); - ino = XFS_DIR2_SF_GET_INUMBER(oldsfp, - XFS_DIR2_SF_INUMBERP(oldsfep)); - XFS_DIR2_SF_PUT_INUMBER(sfp, &ino, XFS_DIR2_SF_INUMBERP(sfep)); + dp->d_ops->sf_put_ino(sfp, sfep, + dp->d_ops->sf_get_ino(oldsfp, oldsfep)); + dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep)); } /* * Clean up the inode. */ - kmem_free(buf, oldsize); + kmem_free(buf); dp->i_d.di_size = newsize; xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); } diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h deleted file mode 100644 index 42f015b7001..00000000000 --- a/fs/xfs/xfs_dir2_sf.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2000-2001,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_SF_H__ -#define __XFS_DIR2_SF_H__ - -/* - * Directory layout when stored internal to an inode. - * - * Small directories are packed as tightly as possible so as to - * fit into the literal area of the inode. - */ - -struct uio; -struct xfs_dabuf; -struct xfs_da_args; -struct xfs_dir2_block; -struct xfs_inode; -struct xfs_mount; -struct xfs_trans; - -/* - * Maximum size of a shortform directory. - */ -#define XFS_DIR2_SF_MAX_SIZE \ - (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \ - (uint)sizeof(xfs_agino_t)) - -/* - * Inode number stored as 8 8-bit values. - */ -typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t; - -/* - * Inode number stored as 4 8-bit values. - * Works a lot of the time, when all the inode numbers in a directory - * fit in 32 bits. - */ -typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t; - -typedef union { - xfs_dir2_ino8_t i8; - xfs_dir2_ino4_t i4; -} xfs_dir2_inou_t; -#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) - -/* - * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t. - * Only need 16 bits, this is the byte offset into the single block form. - */ -typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t; - -/* - * The parent directory has a dedicated field, and the self-pointer must - * be calculated on the fly. - * - * Entries are packed toward the top as tightly as possible. The header - * and the elements must be memcpy'd out into a work area to get correct - * alignment for the inode number fields. - */ -typedef struct xfs_dir2_sf_hdr { - __uint8_t count; /* count of entries */ - __uint8_t i8count; /* count of 8-byte inode #s */ - xfs_dir2_inou_t parent; /* parent dir inode number */ -} xfs_dir2_sf_hdr_t; - -typedef struct xfs_dir2_sf_entry { - __uint8_t namelen; /* actual name length */ - xfs_dir2_sf_off_t offset; /* saved offset */ - __uint8_t name[1]; /* name, variable size */ - xfs_dir2_inou_t inumber; /* inode number, var. offset */ -} xfs_dir2_sf_entry_t; - -typedef struct xfs_dir2_sf { - xfs_dir2_sf_hdr_t hdr; /* shortform header */ - xfs_dir2_sf_entry_t list[1]; /* shortform entries */ -} xfs_dir2_sf_t; - -#define XFS_DIR2_SF_HDR_SIZE(i8count) xfs_dir2_sf_hdr_size(i8count) -static inline int xfs_dir2_sf_hdr_size(int i8count) -{ - return ((uint)sizeof(xfs_dir2_sf_hdr_t) - \ - ((i8count) == 0) * \ - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); -} - -#define XFS_DIR2_SF_INUMBERP(sfep) xfs_dir2_sf_inumberp(sfep) -static inline xfs_dir2_inou_t *xfs_dir2_sf_inumberp(xfs_dir2_sf_entry_t *sfep) -{ - return (xfs_dir2_inou_t *)&(sfep)->name[(sfep)->namelen]; -} - -#define XFS_DIR2_SF_GET_INUMBER(sfp, from) \ - xfs_dir2_sf_get_inumber(sfp, from) -static inline xfs_intino_t -xfs_dir2_sf_get_inumber(xfs_dir2_sf_t *sfp, xfs_dir2_inou_t *from) -{ - return ((sfp)->hdr.i8count == 0 ? \ - (xfs_intino_t)XFS_GET_DIR_INO4((from)->i4) : \ - (xfs_intino_t)XFS_GET_DIR_INO8((from)->i8)); -} - -#define XFS_DIR2_SF_PUT_INUMBER(sfp,from,to) \ - xfs_dir2_sf_put_inumber(sfp,from,to) -static inline void xfs_dir2_sf_put_inumber(xfs_dir2_sf_t *sfp, xfs_ino_t *from, - xfs_dir2_inou_t *to) -{ - if ((sfp)->hdr.i8count == 0) - XFS_PUT_DIR_INO4(*(from), (to)->i4); - else - XFS_PUT_DIR_INO8(*(from), (to)->i8); -} - -#define XFS_DIR2_SF_GET_OFFSET(sfep) \ - xfs_dir2_sf_get_offset(sfep) -static inline xfs_dir2_data_aoff_t -xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) -{ - return INT_GET_UNALIGNED_16_BE(&(sfep)->offset.i); -} - -#define XFS_DIR2_SF_PUT_OFFSET(sfep,off) \ - xfs_dir2_sf_put_offset(sfep,off) -static inline void -xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) -{ - INT_SET_UNALIGNED_16_BE(&(sfep)->offset.i, off); -} - -#define XFS_DIR2_SF_ENTSIZE_BYNAME(sfp,len) \ - xfs_dir2_sf_entsize_byname(sfp,len) -static inline int xfs_dir2_sf_entsize_byname(xfs_dir2_sf_t *sfp, int len) -{ - return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (len) - \ - ((sfp)->hdr.i8count == 0) * \ - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); -} - -#define XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep) \ - xfs_dir2_sf_entsize_byentry(sfp,sfep) -static inline int -xfs_dir2_sf_entsize_byentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) -{ - return ((uint)sizeof(xfs_dir2_sf_entry_t) - 1 + (sfep)->namelen - \ - ((sfp)->hdr.i8count == 0) * \ - ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t))); -} - -#define XFS_DIR2_SF_FIRSTENTRY(sfp) xfs_dir2_sf_firstentry(sfp) -static inline xfs_dir2_sf_entry_t *xfs_dir2_sf_firstentry(xfs_dir2_sf_t *sfp) -{ - return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfp) + XFS_DIR2_SF_HDR_SIZE(sfp->hdr.i8count))); -} - -#define XFS_DIR2_SF_NEXTENTRY(sfp,sfep) xfs_dir2_sf_nextentry(sfp,sfep) -static inline xfs_dir2_sf_entry_t * -xfs_dir2_sf_nextentry(xfs_dir2_sf_t *sfp, xfs_dir2_sf_entry_t *sfep) -{ - return ((xfs_dir2_sf_entry_t *) \ - ((char *)(sfep) + XFS_DIR2_SF_ENTSIZE_BYENTRY(sfp,sfep))); -} - -/* - * Functions. - */ -extern int xfs_dir2_block_sfsize(struct xfs_inode *dp, - struct xfs_dir2_block *block, - xfs_dir2_sf_hdr_t *sfhp); -extern int xfs_dir2_block_to_sf(struct xfs_da_args *args, struct xfs_dabuf *bp, - int size, xfs_dir2_sf_hdr_t *sfhp); -extern int xfs_dir2_sf_addname(struct xfs_da_args *args); -extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino); -extern int xfs_dir2_sf_getdents(struct xfs_inode *dp, struct uio *uio, - int *eofp, struct xfs_dirent *dbp, - xfs_dir2_put_t put); -extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); -extern int xfs_dir2_sf_removename(struct xfs_da_args *args); -extern int xfs_dir2_sf_replace(struct xfs_da_args *args); - -#endif /* __XFS_DIR2_SF_H__ */ diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c deleted file mode 100644 index f3fb2ffd6f5..00000000000 --- a/fs/xfs/xfs_dir2_trace.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_inum.h" -#include "xfs_dir2.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_dir2_trace.h" - -#ifdef XFS_DIR2_TRACE -ktrace_t *xfs_dir2_trace_buf; - -/* - * Enter something in the trace buffers. - */ -static void -xfs_dir2_trace_enter( - xfs_inode_t *dp, - int type, - char *where, - char *name, - int namelen, - void *a0, - void *a1, - void *a2, - void *a3, - void *a4, - void *a5, - void *a6, - void *a7) -{ - void *n[5]; - - ASSERT(xfs_dir2_trace_buf); - ASSERT(dp->i_dir_trace); - if (name) - memcpy(n, name, min((int)sizeof(n), namelen)); - else - memset((char *)n, 0, sizeof(n)); - ktrace_enter(xfs_dir2_trace_buf, - (void *)(long)type, (void *)where, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)(long)namelen, - (void *)n[0], (void *)n[1], (void *)n[2], - (void *)n[3], (void *)n[4]); - ktrace_enter(dp->i_dir_trace, - (void *)(long)type, (void *)where, - (void *)a0, (void *)a1, (void *)a2, (void *)a3, - (void *)a4, (void *)a5, (void *)a6, (void *)a7, - (void *)(long)namelen, - (void *)n[0], (void *)n[1], (void *)n[2], - (void *)n[3], (void *)n[4]); -} - -void -xfs_dir2_trace_args( - char *where, - xfs_da_args_t *args) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, NULL, NULL); -} - -void -xfs_dir2_trace_args_b( - char *where, - xfs_da_args_t *args, - xfs_dabuf_t *bp) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_B, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, - (void *)(bp ? bp->bps[0] : NULL), NULL); -} - -void -xfs_dir2_trace_args_bb( - char *where, - xfs_da_args_t *args, - xfs_dabuf_t *lbp, - xfs_dabuf_t *dbp) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BB, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, - (void *)(lbp ? lbp->bps[0] : NULL), - (void *)(dbp ? dbp->bps[0] : NULL)); -} - -void -xfs_dir2_trace_args_bibii( - char *where, - xfs_da_args_t *args, - xfs_dabuf_t *bs, - int ss, - xfs_dabuf_t *bd, - int sd, - int c) -{ - xfs_buf_t *bpbs = bs ? bs->bps[0] : NULL; - xfs_buf_t *bpbd = bd ? bd->bps[0] : NULL; - - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_BIBII, where, - (char *)args->name, (int)args->namelen, - (void *)args->dp, (void *)args->trans, - (void *)bpbs, (void *)(long)ss, (void *)bpbd, (void *)(long)sd, - (void *)(long)c, NULL); -} - -void -xfs_dir2_trace_args_db( - char *where, - xfs_da_args_t *args, - xfs_dir2_db_t db, - xfs_dabuf_t *bp) -{ - xfs_buf_t *dbp = bp ? bp->bps[0] : NULL; - - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_DB, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)db, - (void *)dbp); -} - -void -xfs_dir2_trace_args_i( - char *where, - xfs_da_args_t *args, - xfs_ino_t i) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_I, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, - (void *)((unsigned long)(i >> 32)), - (void *)((unsigned long)(i & 0xFFFFFFFF))); -} - -void -xfs_dir2_trace_args_s( - char *where, - xfs_da_args_t *args, - int s) -{ - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_S, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL); -} - -void -xfs_dir2_trace_args_sb( - char *where, - xfs_da_args_t *args, - int s, - xfs_dabuf_t *bp) -{ - xfs_buf_t *dbp = bp ? bp->bps[0] : NULL; - - xfs_dir2_trace_enter(args->dp, XFS_DIR2_KTRACE_ARGS_SB, where, - (char *)args->name, (int)args->namelen, - (void *)(unsigned long)args->hashval, - (void *)((unsigned long)(args->inumber >> 32)), - (void *)((unsigned long)(args->inumber & 0xFFFFFFFF)), - (void *)args->dp, (void *)args->trans, - (void *)(unsigned long)args->justcheck, (void *)(long)s, - (void *)dbp); -} -#endif /* XFS_DIR2_TRACE */ diff --git a/fs/xfs/xfs_dir2_trace.h b/fs/xfs/xfs_dir2_trace.h deleted file mode 100644 index ca3c754f482..00000000000 --- a/fs/xfs/xfs_dir2_trace.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DIR2_TRACE_H__ -#define __XFS_DIR2_TRACE_H__ - -/* - * Tracing for xfs v2 directories. - */ - -#if defined(XFS_DIR2_TRACE) - -struct ktrace; -struct xfs_dabuf; -struct xfs_da_args; - -#define XFS_DIR2_GTRACE_SIZE 4096 /* global buffer */ -#define XFS_DIR2_KTRACE_SIZE 32 /* per-inode buffer */ -extern struct ktrace *xfs_dir2_trace_buf; - -#define XFS_DIR2_KTRACE_ARGS 1 /* args only */ -#define XFS_DIR2_KTRACE_ARGS_B 2 /* args + buffer */ -#define XFS_DIR2_KTRACE_ARGS_BB 3 /* args + 2 buffers */ -#define XFS_DIR2_KTRACE_ARGS_DB 4 /* args, db, buffer */ -#define XFS_DIR2_KTRACE_ARGS_I 5 /* args, inum */ -#define XFS_DIR2_KTRACE_ARGS_S 6 /* args, int */ -#define XFS_DIR2_KTRACE_ARGS_SB 7 /* args, int, buffer */ -#define XFS_DIR2_KTRACE_ARGS_BIBII 8 /* args, buf/int/buf/int/int */ - -void xfs_dir2_trace_args(char *where, struct xfs_da_args *args); -void xfs_dir2_trace_args_b(char *where, struct xfs_da_args *args, - struct xfs_dabuf *bp); -void xfs_dir2_trace_args_bb(char *where, struct xfs_da_args *args, - struct xfs_dabuf *lbp, struct xfs_dabuf *dbp); -void xfs_dir2_trace_args_bibii(char *where, struct xfs_da_args *args, - struct xfs_dabuf *bs, int ss, - struct xfs_dabuf *bd, int sd, int c); -void xfs_dir2_trace_args_db(char *where, struct xfs_da_args *args, - xfs_dir2_db_t db, struct xfs_dabuf *bp); -void xfs_dir2_trace_args_i(char *where, struct xfs_da_args *args, xfs_ino_t i); -void xfs_dir2_trace_args_s(char *where, struct xfs_da_args *args, int s); -void xfs_dir2_trace_args_sb(char *where, struct xfs_da_args *args, int s, - struct xfs_dabuf *bp); - -#else /* XFS_DIR2_TRACE */ - -#define xfs_dir2_trace_args(where, args) -#define xfs_dir2_trace_args_b(where, args, bp) -#define xfs_dir2_trace_args_bb(where, args, lbp, dbp) -#define xfs_dir2_trace_args_bibii(where, args, bs, ss, bd, sd, c) -#define xfs_dir2_trace_args_db(where, args, db, bp) -#define xfs_dir2_trace_args_i(where, args, i) -#define xfs_dir2_trace_args_s(where, args, s) -#define xfs_dir2_trace_args_sb(where, args, s, bp) - -#endif /* XFS_DIR2_TRACE */ - -#endif /* __XFS_DIR2_TRACE_H__ */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c new file mode 100644 index 00000000000..4f11ef01113 --- /dev/null +++ b/fs/xfs/xfs_discard.c @@ -0,0 +1,240 @@ +/* + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_quota.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_extent_busy.h" +#include "xfs_discard.h" +#include "xfs_trace.h" +#include "xfs_log.h" + +STATIC int +xfs_trim_extents( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + __uint64_t *blocks_trimmed) +{ + struct block_device *bdev = mp->m_ddev_targp->bt_bdev; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + struct xfs_perag *pag; + int error; + int i; + + pag = xfs_perag_get(mp, agno); + + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); + if (error || !agbp) + goto out_put_perag; + + cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT); + + /* + * Force out the log. This means any transactions that might have freed + * space before we took the AGF buffer lock are now on disk, and the + * volatile disk cache is flushed. + */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * Look up the longest btree in the AGF and start with it. + */ + error = xfs_alloc_lookup_ge(cur, 0, + be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i); + if (error) + goto out_del_cursor; + + /* + * Loop until we are done with all extents that are large + * enough to be worth discarding. + */ + while (i) { + xfs_agblock_t fbno; + xfs_extlen_t flen; + xfs_daddr_t dbno; + xfs_extlen_t dlen; + + error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); + if (error) + goto out_del_cursor; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); + + /* + * use daddr format for all range/len calculations as that is + * the format the range/len variables are supplied in by + * userspace. + */ + dbno = XFS_AGB_TO_DADDR(mp, agno, fbno); + dlen = XFS_FSB_TO_BB(mp, flen); + + /* + * Too small? Give up. + */ + if (dlen < minlen) { + trace_xfs_discard_toosmall(mp, agno, fbno, flen); + goto out_del_cursor; + } + + /* + * If the extent is entirely outside of the range we are + * supposed to discard skip it. Do not bother to trim + * down partially overlapping ranges for now. + */ + if (dbno + dlen < start || dbno > end) { + trace_xfs_discard_exclude(mp, agno, fbno, flen); + goto next_extent; + } + + /* + * If any blocks in the range are still busy, skip the + * discard and try again the next time. + */ + if (xfs_extent_busy_search(mp, agno, fbno, flen)) { + trace_xfs_discard_busy(mp, agno, fbno, flen); + goto next_extent; + } + + trace_xfs_discard_extent(mp, agno, fbno, flen); + error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0); + if (error) + goto out_del_cursor; + *blocks_trimmed += flen; + +next_extent: + error = xfs_btree_decrement(cur, 0, &i); + if (error) + goto out_del_cursor; + } + +out_del_cursor: + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); +out_put_perag: + xfs_perag_put(pag); + return error; +} + +/* + * trim a range of the filesystem. + * + * Note: the parameters passed from userspace are byte ranges into the + * filesystem which does not match to the format we use for filesystem block + * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format + * is a linear address range. Hence we need to use DADDR based conversions and + * comparisons for determining the correct offset and regions to trim. + */ +int +xfs_ioc_trim( + struct xfs_mount *mp, + struct fstrim_range __user *urange) +{ + struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev); + unsigned int granularity = q->limits.discard_granularity; + struct fstrim_range range; + xfs_daddr_t start, end, minlen; + xfs_agnumber_t start_agno, end_agno, agno; + __uint64_t blocks_trimmed = 0; + int error, last_error = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (!blk_queue_discard(q)) + return -XFS_ERROR(EOPNOTSUPP); + if (copy_from_user(&range, urange, sizeof(range))) + return -XFS_ERROR(EFAULT); + + /* + * Truncating down the len isn't actually quite correct, but using + * BBTOB would mean we trivially get overflows for values + * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default + * used by the fstrim application. In the end it really doesn't + * matter as trimming blocks is an advisory interface. + */ + if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || + range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) || + range.len < mp->m_sb.sb_blocksize) + return -XFS_ERROR(EINVAL); + + start = BTOBB(range.start); + end = start + BTOBBT(range.len) - 1; + minlen = BTOBB(max_t(u64, granularity, range.minlen)); + + if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1) + end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1; + + start_agno = xfs_daddr_to_agno(mp, start); + end_agno = xfs_daddr_to_agno(mp, end); + + for (agno = start_agno; agno <= end_agno; agno++) { + error = -xfs_trim_extents(mp, agno, start, end, minlen, + &blocks_trimmed); + if (error) + last_error = error; + } + + if (last_error) + return last_error; + + range.len = XFS_FSB_TO_B(mp, blocks_trimmed); + if (copy_to_user(urange, &range, sizeof(range))) + return -XFS_ERROR(EFAULT); + return 0; +} + +int +xfs_discard_extents( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_extent_busy *busyp; + int error = 0; + + list_for_each_entry(busyp, list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, 0); + if (error && error != EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llu,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + return error; + } + } + + return 0; +} diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h new file mode 100644 index 00000000000..344879aea64 --- /dev/null +++ b/fs/xfs/xfs_discard.h @@ -0,0 +1,10 @@ +#ifndef XFS_DISCARD_H +#define XFS_DISCARD_H 1 + +struct fstrim_range; +struct list_head; + +extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +extern int xfs_discard_extents(struct xfs_mount *, struct list_head *); + +#endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h deleted file mode 100644 index 4e7865ad6f0..00000000000 --- a/fs/xfs/xfs_dmapi.h +++ /dev/null @@ -1,194 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_DMAPI_H__ -#define __XFS_DMAPI_H__ - -#include <linux/version.h> -/* Values used to define the on-disk version of dm_attrname_t. All - * on-disk attribute names start with the 8-byte string "SGI_DMI_". - * - * In the on-disk inode, DMAPI attribute names consist of the user-provided - * name with the DMATTR_PREFIXSTRING pre-pended. This string must NEVER be - * changed. - */ - -#define DMATTR_PREFIXLEN 8 -#define DMATTR_PREFIXSTRING "SGI_DMI_" - -typedef enum { - DM_EVENT_INVALID = -1, - DM_EVENT_CANCEL = 0, /* not supported */ - DM_EVENT_MOUNT = 1, - DM_EVENT_PREUNMOUNT = 2, - DM_EVENT_UNMOUNT = 3, - DM_EVENT_DEBUT = 4, /* not supported */ - DM_EVENT_CREATE = 5, - DM_EVENT_CLOSE = 6, /* not supported */ - DM_EVENT_POSTCREATE = 7, - DM_EVENT_REMOVE = 8, - DM_EVENT_POSTREMOVE = 9, - DM_EVENT_RENAME = 10, - DM_EVENT_POSTRENAME = 11, - DM_EVENT_LINK = 12, - DM_EVENT_POSTLINK = 13, - DM_EVENT_SYMLINK = 14, - DM_EVENT_POSTSYMLINK = 15, - DM_EVENT_READ = 16, - DM_EVENT_WRITE = 17, - DM_EVENT_TRUNCATE = 18, - DM_EVENT_ATTRIBUTE = 19, - DM_EVENT_DESTROY = 20, - DM_EVENT_NOSPACE = 21, - DM_EVENT_USER = 22, - DM_EVENT_MAX = 23 -} dm_eventtype_t; -#define HAVE_DM_EVENTTYPE_T - -typedef enum { - DM_RIGHT_NULL, - DM_RIGHT_SHARED, - DM_RIGHT_EXCL -} dm_right_t; -#define HAVE_DM_RIGHT_T - -/* Defines for determining if an event message should be sent. */ -#define DM_EVENT_ENABLED(vfsp, ip, event) ( \ - unlikely ((vfsp)->vfs_flag & VFS_DMI) && \ - ( ((ip)->i_d.di_dmevmask & (1 << event)) || \ - ((ip)->i_mount->m_dmevmask & (1 << event)) ) \ - ) - -#define DM_EVENT_ENABLED_IO(vfsp, io, event) ( \ - unlikely ((vfsp)->vfs_flag & VFS_DMI) && \ - ( ((io)->io_dmevmask & (1 << event)) || \ - ((io)->io_mount->m_dmevmask & (1 << event)) ) \ - ) - -#define DM_XFS_VALID_FS_EVENTS ( \ - (1 << DM_EVENT_PREUNMOUNT) | \ - (1 << DM_EVENT_UNMOUNT) | \ - (1 << DM_EVENT_NOSPACE) | \ - (1 << DM_EVENT_DEBUT) | \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events valid in dm_set_eventlist() when called with a file handle for - a regular file or a symlink. These events are persistent. -*/ - -#define DM_XFS_VALID_FILE_EVENTS ( \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events valid in dm_set_eventlist() when called with a file handle for - a directory. These events are persistent. -*/ - -#define DM_XFS_VALID_DIRECTORY_EVENTS ( \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - -/* Events supported by the XFS filesystem. */ -#define DM_XFS_SUPPORTED_EVENTS ( \ - (1 << DM_EVENT_MOUNT) | \ - (1 << DM_EVENT_PREUNMOUNT) | \ - (1 << DM_EVENT_UNMOUNT) | \ - (1 << DM_EVENT_NOSPACE) | \ - (1 << DM_EVENT_CREATE) | \ - (1 << DM_EVENT_POSTCREATE) | \ - (1 << DM_EVENT_REMOVE) | \ - (1 << DM_EVENT_POSTREMOVE) | \ - (1 << DM_EVENT_RENAME) | \ - (1 << DM_EVENT_POSTRENAME) | \ - (1 << DM_EVENT_LINK) | \ - (1 << DM_EVENT_POSTLINK) | \ - (1 << DM_EVENT_SYMLINK) | \ - (1 << DM_EVENT_POSTSYMLINK) | \ - (1 << DM_EVENT_READ) | \ - (1 << DM_EVENT_WRITE) | \ - (1 << DM_EVENT_TRUNCATE) | \ - (1 << DM_EVENT_ATTRIBUTE) | \ - (1 << DM_EVENT_DESTROY) ) - - -/* - * Definitions used for the flags field on dm_send_*_event(). - */ - -#define DM_FLAGS_NDELAY 0x001 /* return EAGAIN after dm_pending() */ -#define DM_FLAGS_UNWANTED 0x002 /* event not in fsys dm_eventset_t */ -#define DM_FLAGS_IMUX 0x004 /* thread holds i_mutex */ -#define DM_FLAGS_IALLOCSEM_RD 0x010 /* thread holds i_alloc_sem rd */ -#define DM_FLAGS_IALLOCSEM_WR 0x020 /* thread holds i_alloc_sem wr */ - -/* - * Based on IO_ISDIRECT, decide which i_ flag is set. - */ -#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IMUX : 0) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) -#endif - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) && \ - (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,22)) -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - DM_FLAGS_IALLOCSEM_RD : DM_FLAGS_IMUX) -#define DM_SEM_FLAG_WR (DM_FLAGS_IALLOCSEM_WR | DM_FLAGS_IMUX) -#endif - -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,21) -#define DM_SEM_FLAG_RD(ioflags) (((ioflags) & IO_ISDIRECT) ? \ - 0 : DM_FLAGS_IMUX) -#define DM_SEM_FLAG_WR (DM_FLAGS_IMUX) -#endif - - -/* - * Macros to turn caller specified delay/block flags into - * dm_send_xxxx_event flag DM_FLAGS_NDELAY. - */ - -#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \ - DM_FLAGS_NDELAY : 0) -#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0) - - -extern struct bhv_module_vfsops xfs_dmops; - -#endif /* __XFS_DMAPI_H__ */ diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c new file mode 100644 index 00000000000..3ee0cd43edc --- /dev/null +++ b/fs/xfs/xfs_dquot.c @@ -0,0 +1,1105 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_alloc.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_space.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" + +/* + * Lock order: + * + * ip->i_lock + * qi->qi_tree_lock + * dquot->q_qlock (xfs_dqlock() and friends) + * dquot->q_flush (xfs_dqflock() and friends) + * qi->qi_lru_lock + * + * If two dquots need to be locked the order is user before group/project, + * otherwise by the lowest id first, see xfs_dqlock2. + */ + +#ifdef DEBUG +xfs_buftarg_t *xfs_dqerror_target; +int xfs_do_dqerror; +int xfs_dqreq_num; +int xfs_dqerror_mod = 33; +#endif + +struct kmem_zone *xfs_qm_dqtrxzone; +static struct kmem_zone *xfs_qm_dqzone; + +static struct lock_class_key xfs_dquot_group_class; +static struct lock_class_key xfs_dquot_project_class; + +/* + * This is called to free all the memory associated with a dquot + */ +void +xfs_qm_dqdestroy( + xfs_dquot_t *dqp) +{ + ASSERT(list_empty(&dqp->q_lru)); + + mutex_destroy(&dqp->q_qlock); + kmem_zone_free(xfs_qm_dqzone, dqp); + + XFS_STATS_DEC(xs_qm_dquot); +} + +/* + * If default limits are in force, push them into the dquot now. + * We overwrite the dquot limits only if they are zero and this + * is not the root dquot. + */ +void +xfs_qm_adjust_dqlimits( + struct xfs_mount *mp, + struct xfs_dquot *dq) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *d = &dq->q_core; + int prealloc = 0; + + ASSERT(d->d_id); + + if (q->qi_bsoftlimit && !d->d_blk_softlimit) { + d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit); + prealloc = 1; + } + if (q->qi_bhardlimit && !d->d_blk_hardlimit) { + d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit); + prealloc = 1; + } + if (q->qi_isoftlimit && !d->d_ino_softlimit) + d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit); + if (q->qi_ihardlimit && !d->d_ino_hardlimit) + d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit); + if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit) + d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit); + if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit) + d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit); + + if (prealloc) + xfs_dquot_set_prealloc_limits(dq); +} + +/* + * Check the limits and timers of a dquot and start or reset timers + * if necessary. + * This gets called even when quota enforcement is OFF, which makes our + * life a little less complicated. (We just don't reject any quota + * reservations in that case, when enforcement is off). + * We also return 0 as the values of the timers in Q_GETQUOTA calls, when + * enforcement's off. + * In contrast, warnings are a little different in that they don't + * 'automatically' get started when limits get exceeded. They do + * get reset to zero, however, when we find the count to be under + * the soft limit (they are only ever set non-zero via userspace). + */ +void +xfs_qm_adjust_dqtimers( + xfs_mount_t *mp, + xfs_disk_dquot_t *d) +{ + ASSERT(d->d_id); + +#ifdef DEBUG + if (d->d_blk_hardlimit) + ASSERT(be64_to_cpu(d->d_blk_softlimit) <= + be64_to_cpu(d->d_blk_hardlimit)); + if (d->d_ino_hardlimit) + ASSERT(be64_to_cpu(d->d_ino_softlimit) <= + be64_to_cpu(d->d_ino_hardlimit)); + if (d->d_rtb_hardlimit) + ASSERT(be64_to_cpu(d->d_rtb_softlimit) <= + be64_to_cpu(d->d_rtb_hardlimit)); +#endif + + if (!d->d_btimer) { + if ((d->d_blk_softlimit && + (be64_to_cpu(d->d_bcount) > + be64_to_cpu(d->d_blk_softlimit))) || + (d->d_blk_hardlimit && + (be64_to_cpu(d->d_bcount) > + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_btimelimit); + } else { + d->d_bwarns = 0; + } + } else { + if ((!d->d_blk_softlimit || + (be64_to_cpu(d->d_bcount) <= + be64_to_cpu(d->d_blk_softlimit))) && + (!d->d_blk_hardlimit || + (be64_to_cpu(d->d_bcount) <= + be64_to_cpu(d->d_blk_hardlimit)))) { + d->d_btimer = 0; + } + } + + if (!d->d_itimer) { + if ((d->d_ino_softlimit && + (be64_to_cpu(d->d_icount) > + be64_to_cpu(d->d_ino_softlimit))) || + (d->d_ino_hardlimit && + (be64_to_cpu(d->d_icount) > + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_itimelimit); + } else { + d->d_iwarns = 0; + } + } else { + if ((!d->d_ino_softlimit || + (be64_to_cpu(d->d_icount) <= + be64_to_cpu(d->d_ino_softlimit))) && + (!d->d_ino_hardlimit || + (be64_to_cpu(d->d_icount) <= + be64_to_cpu(d->d_ino_hardlimit)))) { + d->d_itimer = 0; + } + } + + if (!d->d_rtbtimer) { + if ((d->d_rtb_softlimit && + (be64_to_cpu(d->d_rtbcount) > + be64_to_cpu(d->d_rtb_softlimit))) || + (d->d_rtb_hardlimit && + (be64_to_cpu(d->d_rtbcount) > + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = cpu_to_be32(get_seconds() + + mp->m_quotainfo->qi_rtbtimelimit); + } else { + d->d_rtbwarns = 0; + } + } else { + if ((!d->d_rtb_softlimit || + (be64_to_cpu(d->d_rtbcount) <= + be64_to_cpu(d->d_rtb_softlimit))) && + (!d->d_rtb_hardlimit || + (be64_to_cpu(d->d_rtbcount) <= + be64_to_cpu(d->d_rtb_hardlimit)))) { + d->d_rtbtimer = 0; + } + } +} + +/* + * initialize a buffer full of dquots and log the whole thing + */ +STATIC void +xfs_qm_init_dquot_blk( + xfs_trans_t *tp, + xfs_mount_t *mp, + xfs_dqid_t id, + uint type, + xfs_buf_t *bp) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + xfs_dqblk_t *d; + int curid, i; + + ASSERT(tp); + ASSERT(xfs_buf_islocked(bp)); + + d = bp->b_addr; + + /* + * ID of the first dquot in the block - id's are zero based. + */ + curid = id - (id % q->qi_dqperchunk); + ASSERT(curid >= 0); + memset(d, 0, BBTOB(q->qi_dqchunklen)); + for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) { + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_id = cpu_to_be32(curid); + d->dd_diskdq.d_flags = type; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + } + + xfs_trans_dquot_buf(tp, bp, + (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF : + ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF : + XFS_BLF_GDQUOT_BUF))); + xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); +} + +/* + * Initialize the dynamic speculative preallocation thresholds. The lo/hi + * watermarks correspond to the soft and hard limits by default. If a soft limit + * is not specified, we use 95% of the hard limit. + */ +void +xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) +{ + __uint64_t space; + + dqp->q_prealloc_hi_wmark = be64_to_cpu(dqp->q_core.d_blk_hardlimit); + dqp->q_prealloc_lo_wmark = be64_to_cpu(dqp->q_core.d_blk_softlimit); + if (!dqp->q_prealloc_lo_wmark) { + dqp->q_prealloc_lo_wmark = dqp->q_prealloc_hi_wmark; + do_div(dqp->q_prealloc_lo_wmark, 100); + dqp->q_prealloc_lo_wmark *= 95; + } + + space = dqp->q_prealloc_hi_wmark; + + do_div(space, 100); + dqp->q_low_space[XFS_QLOWSP_1_PCNT] = space; + dqp->q_low_space[XFS_QLOWSP_3_PCNT] = space * 3; + dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; +} + +/* + * Allocate a block and fill it with dquots. + * This is called when the bmapi finds a hole. + */ +STATIC int +xfs_qm_dqalloc( + xfs_trans_t **tpp, + xfs_mount_t *mp, + xfs_dquot_t *dqp, + xfs_inode_t *quotip, + xfs_fileoff_t offset_fsb, + xfs_buf_t **O_bpp) +{ + xfs_fsblock_t firstblock; + xfs_bmap_free_t flist; + xfs_bmbt_irec_t map; + int nmaps, error, committed; + xfs_buf_t *bp; + xfs_trans_t *tp = *tpp; + + ASSERT(tp != NULL); + + trace_xfs_dqalloc(dqp); + + /* + * Initialize the bmap freelist prior to calling bmapi code. + */ + xfs_bmap_init(&flist, &firstblock); + xfs_ilock(quotip, XFS_ILOCK_EXCL); + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + return (ESRCH); + } + + xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); + nmaps = 1; + error = xfs_bmapi_write(tp, quotip, offset_fsb, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &flist); + if (error) + goto error0; + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); + ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + + /* + * Keep track of the blkno to save a lookup later + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + /* now we can just get the buffer (there's nothing to read yet) */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0); + if (!bp) { + error = ENOMEM; + goto error1; + } + bp->b_ops = &xfs_dquot_buf_ops; + + /* + * Make a chunk of dquots out of this buffer and log + * the entire thing. + */ + xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + + /* + * xfs_bmap_finish() may commit the current transaction and + * start a second transaction if the freelist is not empty. + * + * Since we still want to modify this buffer, we need to + * ensure that the buffer is not released on commit of + * the first transaction and ensure the buffer is added to the + * second transaction. + * + * If there is only one transaction then don't stop the buffer + * from being released when it commits later on. + */ + + xfs_trans_bhold(tp, bp); + + if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + goto error1; + } + + if (committed) { + tp = *tpp; + xfs_trans_bjoin(tp, bp); + } else { + xfs_trans_bhold_release(tp, bp); + } + + *O_bpp = bp; + return 0; + + error1: + xfs_bmap_cancel(&flist); + error0: + xfs_iunlock(quotip, XFS_ILOCK_EXCL); + + return (error); +} + +STATIC int +xfs_qm_dqrepair( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_dquot *dqp, + xfs_dqid_t firstid, + struct xfs_buf **bpp) +{ + int error; + struct xfs_disk_dquot *ddq; + struct xfs_dqblk *d; + int i; + + /* + * Read the buffer without verification so we get the corrupted + * buffer returned to us. make sure we verify it on write, though. + */ + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, bpp, NULL); + + if (error) { + ASSERT(*bpp == NULL); + return XFS_ERROR(error); + } + (*bpp)->b_ops = &xfs_dquot_buf_ops; + + ASSERT(xfs_buf_islocked(*bpp)); + d = (struct xfs_dqblk *)(*bpp)->b_addr; + + /* Do the actual repair of dquots in this buffer */ + for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { + ddq = &d[i].dd_diskdq; + error = xfs_dqcheck(mp, ddq, firstid + i, + dqp->dq_flags & XFS_DQ_ALLTYPES, + XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); + if (error) { + /* repair failed, we're screwed */ + xfs_trans_brelse(tp, *bpp); + return XFS_ERROR(EIO); + } + } + + return 0; +} + +/* + * Maps a dquot to the buffer containing its on-disk version. + * This returns a ptr to the buffer containing the on-disk dquot + * in the bpp param, and a ptr to the on-disk dquot within that buffer + */ +STATIC int +xfs_qm_dqtobp( + xfs_trans_t **tpp, + xfs_dquot_t *dqp, + xfs_disk_dquot_t **O_ddpp, + xfs_buf_t **O_bpp, + uint flags) +{ + struct xfs_bmbt_irec map; + int nmaps = 1, error; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_dq_to_quota_inode(dqp); + struct xfs_mount *mp = dqp->q_mount; + xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); + struct xfs_trans *tp = (tpp ? *tpp : NULL); + uint lock_mode; + + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + + lock_mode = xfs_ilock_data_map_shared(quotip); + if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + /* + * Return if this type of quotas is turned off while we + * didn't have the quota inode lock. + */ + xfs_iunlock(quotip, lock_mode); + return ESRCH; + } + + /* + * Find the block map; no allocations yet + */ + error = xfs_bmapi_read(quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); + + xfs_iunlock(quotip, lock_mode); + if (error) + return error; + + ASSERT(nmaps == 1); + ASSERT(map.br_blockcount == 1); + + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); + + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) { + /* + * We don't allocate unless we're asked to + */ + if (!(flags & XFS_QMOPT_DQALLOC)) + return ENOENT; + + ASSERT(tp); + error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, + dqp->q_fileoffset, &bp); + if (error) + return error; + tp = *tpp; + } else { + trace_xfs_dqtobp_read(dqp); + + /* + * store the blkno etc so that we don't have to do the + * mapping all the time + */ + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); + + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, + 0, &bp, &xfs_dquot_buf_ops); + + if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { + xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * + mp->m_quotainfo->qi_dqperchunk; + ASSERT(bp == NULL); + error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); + } + + if (error) { + ASSERT(bp == NULL); + return XFS_ERROR(error); + } + } + + ASSERT(xfs_buf_islocked(bp)); + *O_bpp = bp; + *O_ddpp = bp->b_addr + dqp->q_bufoffset; + + return (0); +} + + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. + * + * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. + */ +int +xfs_qm_dqread( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + uint flags, + struct xfs_dquot **O_dqpp) +{ + struct xfs_dquot *dqp; + struct xfs_disk_dquot *ddqp; + struct xfs_buf *bp; + struct xfs_trans *tp = NULL; + int error; + int cancelflags = 0; + + + dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); + + dqp->dq_flags = type; + dqp->q_core.d_id = cpu_to_be32(id); + dqp->q_mount = mp; + INIT_LIST_HEAD(&dqp->q_lru); + mutex_init(&dqp->q_qlock); + init_waitqueue_head(&dqp->q_pinwait); + + /* + * Because we want to use a counting completion, complete + * the flush completion once to allow a single access to + * the flush completion without blocking. + */ + init_completion(&dqp->q_flush); + complete(&dqp->q_flush); + + /* + * Make sure group quotas have a different lock class than user + * quotas. + */ + switch (type) { + case XFS_DQ_USER: + /* uses the default lock class */ + break; + case XFS_DQ_GROUP: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_group_class); + break; + case XFS_DQ_PROJ: + lockdep_set_class(&dqp->q_qlock, &xfs_dquot_project_class); + break; + default: + ASSERT(0); + break; + } + + XFS_STATS_INC(xs_qm_dquot); + + trace_xfs_dqread(dqp); + + if (flags & XFS_QMOPT_DQALLOC) { + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0); + if (error) + goto error1; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; + } + + /* + * get a pointer to the on-disk dquot and the buffer containing it + * dqp already knows its own type (GROUP/USER). + */ + error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); + if (error) { + /* + * This can happen if quotas got turned off (ESRCH), + * or if the dquot didn't exist on disk and we ask to + * allocate (ENOENT). + */ + trace_xfs_dqread_fail(dqp); + cancelflags |= XFS_TRANS_ABORT; + goto error1; + } + + /* copy everything from disk dquot to the incore dquot */ + memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); + xfs_qm_dquot_logitem_init(dqp); + + /* + * Reservation counters are defined as reservation plus current usage + * to avoid having to add every time. + */ + dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); + dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); + dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount); + + /* initialize the dquot speculative prealloc thresholds */ + xfs_dquot_set_prealloc_limits(dqp); + + /* Mark the buf so that this will stay incore a little longer */ + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + + /* + * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) + * So we need to release with xfs_trans_brelse(). + * The strategy here is identical to that of inodes; we lock + * the dquot in xfs_qm_dqget() before making it accessible to + * others. This is because dquots, like inodes, need a good level of + * concurrency, and we don't want to take locks on the entire buffers + * for dquot accesses. + * Note also that the dquot buffer may even be dirty at this point, if + * this particular dquot was repaired. We still aren't afraid to + * brelse it because we have the changes incore. + */ + ASSERT(xfs_buf_islocked(bp)); + xfs_trans_brelse(tp, bp); + + if (tp) { + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error0; + } + + *O_dqpp = dqp; + return error; + +error1: + if (tp) + xfs_trans_cancel(tp, cancelflags); +error0: + xfs_qm_dqdestroy(dqp); + *O_dqpp = NULL; + return error; +} + +/* + * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a + * a locked dquot, doing an allocation (if requested) as needed. + * When both an inode and an id are given, the inode's id takes precedence. + * That is, if the id changes while we don't hold the ilock inside this + * function, the new dquot is returned, not necessarily the one requested + * in the id argument. + */ +int +xfs_qm_dqget( + xfs_mount_t *mp, + xfs_inode_t *ip, /* locked inode (optional) */ + xfs_dqid_t id, /* uid/projid/gid depending on type */ + uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ + uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ + xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct xfs_dquot *dqp; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || + (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || + (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { + return (ESRCH); + } + +#ifdef DEBUG + if (xfs_do_dqerror) { + if ((xfs_dqerror_target == mp->m_ddev_targp) && + (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { + xfs_debug(mp, "Returning error in dqget"); + return (EIO); + } + } + + ASSERT(type == XFS_DQ_USER || + type == XFS_DQ_PROJ || + type == XFS_DQ_GROUP); + if (ip) { + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_inode_dquot(ip, type) == NULL); + } +#endif + +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (dqp) { + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(xs_qm_dqcachehits); + *O_dqpp = dqp; + return 0; + } + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(xs_qm_dqcachemisses); + + /* + * Dquot cache miss. We don't want to keep the inode lock across + * a (potential) disk read. Also we don't want to deal with the lock + * ordering between quotainode and this inode. OTOH, dropping the inode + * lock here means dealing with a chown that can happen before + * we re-acquire the lock. + */ + if (ip) + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + error = xfs_qm_dqread(mp, id, type, flags, &dqp); + + if (ip) + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (error) + return error; + + if (ip) { + /* + * A dquot could be attached to this inode by now, since + * we had dropped the ilock. + */ + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { + xfs_qm_dqdestroy(dqp); + dqp = dqp1; + xfs_dqlock(dqp); + goto dqret; + } + } else { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return XFS_ERROR(ESRCH); + } + } + + mutex_lock(&qi->qi_tree_lock); + error = -radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + WARN_ON(error != EEXIST); + + /* + * Duplicate found. Just throw away the new dquot and start + * over. + */ + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(xs_qm_dquot_dups); + goto restart; + } + + /* + * We return a locked dquot to the caller, with a reference taken + */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + + dqret: + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return (0); +} + +/* + * Release a reference to the dquot (decrement ref-count) and unlock it. + * + * If there is a group quota attached to this dquot, carefully release that + * too without tripping over deadlocks'n'stuff. + */ +void +xfs_qm_dqput( + struct xfs_dquot *dqp) +{ + ASSERT(dqp->q_nrefs > 0); + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + trace_xfs_dqput(dqp); + + if (--dqp->q_nrefs == 0) { + struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo; + trace_xfs_dqput_free(dqp); + + if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) + XFS_STATS_INC(xs_qm_dquot_unused); + } + xfs_dqunlock(dqp); +} + +/* + * Release a dquot. Flush it if dirty, then dqput() it. + * dquot must not be locked. + */ +void +xfs_qm_dqrele( + xfs_dquot_t *dqp) +{ + if (!dqp) + return; + + trace_xfs_dqrele(dqp); + + xfs_dqlock(dqp); + /* + * We don't care to flush it if the dquot is dirty here. + * That will create stutters that we want to avoid. + * Instead we do a delayed write when we try to reclaim + * a dirty dquot. Also xfs_sync will take part of the burden... + */ + xfs_qm_dqput(dqp); +} + +/* + * This is the dquot flushing I/O completion routine. It is called + * from interrupt level when the buffer containing the dquot is + * flushed to disk. It is responsible for removing the dquot logitem + * from the AIL if it has not been re-logged, and unlocking the dquot's + * flush lock. This behavior is very similar to that of inodes.. + */ +STATIC void +xfs_qm_dqflush_done( + struct xfs_buf *bp, + struct xfs_log_item *lip) +{ + xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip; + xfs_dquot_t *dqp = qip->qli_dquot; + struct xfs_ail *ailp = lip->li_ailp; + + /* + * We only want to pull the item from the AIL if its + * location in the log has not changed since we started the flush. + * Thus, we only bother if the dquot's lsn has + * not changed. First we check the lsn outside the lock + * since it's cheaper, and then we recheck while + * holding the lock before removing the dquot from the AIL. + */ + if ((lip->li_flags & XFS_LI_IN_AIL) && + lip->li_lsn == qip->qli_flush_lsn) { + + /* xfs_trans_ail_delete() drops the AIL lock. */ + spin_lock(&ailp->xa_lock); + if (lip->li_lsn == qip->qli_flush_lsn) + xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&ailp->xa_lock); + } + + /* + * Release the dq's flush lock since we're done with it. + */ + xfs_dqfunlock(dqp); +} + +/* + * Write a modified dquot to disk. + * The dquot must be locked and the flush lock too taken by caller. + * The flush lock will not be unlocked until the dquot reaches the disk, + * but the dquot is free to be unlocked and modified by the caller + * in the interim. Dquot is still locked on return. This behavior is + * identical to that of inodes. + */ +int +xfs_qm_dqflush( + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_buf *bp; + struct xfs_disk_dquot *ddqp; + int error; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + ASSERT(!completion_done(&dqp->q_flush)); + + trace_xfs_dqflush(dqp); + + *bpp = NULL; + + xfs_qm_dqunpin_wait(dqp); + + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this dquot + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an emptry AIL as part of the unmount process. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item *lip = &dqp->q_logitem.qli_item; + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + spin_lock(&mp->m_ail->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(mp->m_ail, lip, + SHUTDOWN_CORRUPT_INCORE); + else + spin_unlock(&mp->m_ail->xa_lock); + error = XFS_ERROR(EIO); + goto out_unlock; + } + + /* + * Get the buffer containing the on-disk dquot + */ + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); + if (error) + goto out_unlock; + + /* + * Calculate the location of the dquot inside the buffer. + */ + ddqp = bp->b_addr + dqp->q_bufoffset; + + /* + * A simple sanity check in case we got a corrupted dquot.. + */ + error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, + XFS_QMOPT_DOWARN, "dqflush (incore copy)"); + if (error) { + xfs_buf_relse(bp); + xfs_dqfunlock(dqp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return XFS_ERROR(EIO); + } + + /* This is the only portion of data that needs to persist */ + memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t)); + + /* + * Clear the dirty field and remember the flush lsn for later use. + */ + dqp->dq_flags &= ~XFS_DQ_DIRTY; + + xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn, + &dqp->q_logitem.qli_item.li_lsn); + + /* + * copy the lsn into the on-disk dquot now while we have the in memory + * dquot here. This can't be done later in the write verifier as we + * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; + + dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + /* + * Attach an iodone routine so that we can remove this dquot from the + * AIL and release the flush lock once the dquot is synced to disk. + */ + xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done, + &dqp->q_logitem.qli_item); + + /* + * If the buffer is pinned then push on the log so we won't + * get stuck waiting in the write for too long. + */ + if (xfs_buf_ispinned(bp)) { + trace_xfs_dqflush_force(dqp); + xfs_log_force(mp, 0); + } + + trace_xfs_dqflush_done(dqp); + *bpp = bp; + return 0; + +out_unlock: + xfs_dqfunlock(dqp); + return XFS_ERROR(EIO); +} + +/* + * Lock two xfs_dquot structures. + * + * To avoid deadlocks we always lock the quota structure with + * the lowerd id first. + */ +void +xfs_dqlock2( + xfs_dquot_t *d1, + xfs_dquot_t *d2) +{ + if (d1 && d2) { + ASSERT(d1 != d2); + if (be32_to_cpu(d1->q_core.d_id) > + be32_to_cpu(d2->q_core.d_id)) { + mutex_lock(&d2->q_qlock); + mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED); + } else { + mutex_lock(&d1->q_qlock); + mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED); + } + } else if (d1) { + mutex_lock(&d1->q_qlock); + } else if (d2) { + mutex_lock(&d2->q_qlock); + } +} + +int __init +xfs_qm_init(void) +{ + xfs_qm_dqzone = + kmem_zone_init(sizeof(struct xfs_dquot), "xfs_dquot"); + if (!xfs_qm_dqzone) + goto out; + + xfs_qm_dqtrxzone = + kmem_zone_init(sizeof(struct xfs_dquot_acct), "xfs_dqtrx"); + if (!xfs_qm_dqtrxzone) + goto out_free_dqzone; + + return 0; + +out_free_dqzone: + kmem_zone_destroy(xfs_qm_dqzone); +out: + return -ENOMEM; +} + +void +xfs_qm_exit(void) +{ + kmem_zone_destroy(xfs_qm_dqtrxzone); + kmem_zone_destroy(xfs_qm_dqzone); +} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h new file mode 100644 index 00000000000..68a68f70483 --- /dev/null +++ b/fs/xfs/xfs_dquot.h @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_DQUOT_H__ +#define __XFS_DQUOT_H__ + +/* + * Dquots are structures that hold quota information about a user or a group, + * much like inodes are for files. In fact, dquots share many characteristics + * with inodes. However, dquots can also be a centralized resource, relative + * to a collection of inodes. In this respect, dquots share some characteristics + * of the superblock. + * XFS dquots exploit both those in its algorithms. They make every attempt + * to not be a bottleneck when quotas are on and have minimal impact, if any, + * when quotas are off. + */ + +struct xfs_mount; +struct xfs_trans; + +enum { + XFS_QLOWSP_1_PCNT = 0, + XFS_QLOWSP_3_PCNT, + XFS_QLOWSP_5_PCNT, + XFS_QLOWSP_MAX +}; + +/* + * The incore dquot structure + */ +typedef struct xfs_dquot { + uint dq_flags; /* various flags (XFS_DQ_*) */ + struct list_head q_lru; /* global free list of dquots */ + struct xfs_mount*q_mount; /* filesystem this relates to */ + struct xfs_trans*q_transp; /* trans this belongs to currently */ + uint q_nrefs; /* # active refs from inodes */ + xfs_daddr_t q_blkno; /* blkno of dquot buffer */ + int q_bufoffset; /* off of dq in buffer (# dquots) */ + xfs_fileoff_t q_fileoffset; /* offset in quotas file */ + + xfs_disk_dquot_t q_core; /* actual usage & quotas */ + xfs_dq_logitem_t q_logitem; /* dquot log item */ + xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */ + xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */ + xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */ + xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */ + xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */ + int64_t q_low_space[XFS_QLOWSP_MAX]; + struct mutex q_qlock; /* quota lock */ + struct completion q_flush; /* flush completion queue */ + atomic_t q_pincount; /* dquot pin count */ + wait_queue_head_t q_pinwait; /* dquot pinning wait queue */ +} xfs_dquot_t; + +/* + * Lock hierarchy for q_qlock: + * XFS_QLOCK_NORMAL is the implicit default, + * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 + */ +enum { + XFS_QLOCK_NORMAL = 0, + XFS_QLOCK_NESTED, +}; + +/* + * Manage the q_flush completion queue embedded in the dquot. This completion + * queue synchronizes processes attempting to flush the in-core dquot back to + * disk. + */ +static inline void xfs_dqflock(xfs_dquot_t *dqp) +{ + wait_for_completion(&dqp->q_flush); +} + +static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp) +{ + return try_wait_for_completion(&dqp->q_flush); +} + +static inline void xfs_dqfunlock(xfs_dquot_t *dqp) +{ + complete(&dqp->q_flush); +} + +static inline int xfs_dqlock_nowait(struct xfs_dquot *dqp) +{ + return mutex_trylock(&dqp->q_qlock); +} + +static inline void xfs_dqlock(struct xfs_dquot *dqp) +{ + mutex_lock(&dqp->q_qlock); +} + +static inline void xfs_dqunlock(struct xfs_dquot *dqp) +{ + mutex_unlock(&dqp->q_qlock); +} + +static inline int xfs_this_quota_on(struct xfs_mount *mp, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return XFS_IS_UQUOTA_ON(mp); + case XFS_DQ_GROUP: + return XFS_IS_GQUOTA_ON(mp); + case XFS_DQ_PROJ: + return XFS_IS_PQUOTA_ON(mp); + default: + return 0; + } +} + +static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type) +{ + switch (type & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return ip->i_udquot; + case XFS_DQ_GROUP: + return ip->i_gdquot; + case XFS_DQ_PROJ: + return ip->i_pdquot; + default: + return NULL; + } +} + +#define XFS_DQ_IS_LOCKED(dqp) (mutex_is_locked(&((dqp)->q_qlock))) +#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY) +#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER) +#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) +#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) + +extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, + uint, struct xfs_dquot **); +extern void xfs_qm_dqdestroy(xfs_dquot_t *); +extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); +extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); +extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, + xfs_disk_dquot_t *); +extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, + struct xfs_dquot *); +extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, + xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern void xfs_qm_dqput(xfs_dquot_t *); + +extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); + +extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *); + +static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) +{ + xfs_dqlock(dqp); + dqp->q_nrefs++; + xfs_dqunlock(dqp); + return dqp; +} + +#endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c new file mode 100644 index 00000000000..c2ac0c611ad --- /dev/null +++ b/fs/xfs/xfs_dquot_buf.c @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" + +int +xfs_calc_dquots_per_chunk( + unsigned int nbblks) /* basic block units */ +{ + unsigned int ndquots; + + ASSERT(nbblks > 0); + ndquots = BBTOB(nbblks); + do_div(ndquots, sizeof(xfs_dqblk_t)); + + return ndquots; +} + +/* + * Do some primitive error checking on ondisk dquot data structures. + */ +int +xfs_dqcheck( + struct xfs_mount *mp, + xfs_disk_dquot_t *ddq, + xfs_dqid_t id, + uint type, /* used only when IO_dorepair is true */ + uint flags, + char *str) +{ + xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; + int errs = 0; + + /* + * We can encounter an uninitialized dquot buffer for 2 reasons: + * 1. If we crash while deleting the quotainode(s), and those blks got + * used for user data. This is because we take the path of regular + * file deletion; however, the size field of quotainodes is never + * updated, so all the tricks that we play in itruncate_finish + * don't quite matter. + * + * 2. We don't play the quota buffers when there's a quotaoff logitem. + * But the allocation will be replayed so we'll end up with an + * uninitialized quota block. + * + * This is all fine; things are still consistent, and we haven't lost + * any quota information. Just don't complain about bad dquot blks. + */ + if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", + str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); + errs++; + } + if (ddq->d_version != XFS_DQUOT_VERSION) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", + str, id, ddq->d_version, XFS_DQUOT_VERSION); + errs++; + } + + if (ddq->d_flags != XFS_DQ_USER && + ddq->d_flags != XFS_DQ_PROJ && + ddq->d_flags != XFS_DQ_GROUP) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : XFS dquot ID 0x%x, unknown flags 0x%x", + str, id, ddq->d_flags); + errs++; + } + + if (id != -1 && id != be32_to_cpu(ddq->d_id)) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : ondisk-dquot 0x%p, ID mismatch: " + "0x%x expected, found id 0x%x", + str, ddq, id, be32_to_cpu(ddq->d_id)); + errs++; + } + + if (!errs && ddq->d_id) { + if (ddq->d_blk_softlimit && + be64_to_cpu(ddq->d_bcount) > + be64_to_cpu(ddq->d_blk_softlimit)) { + if (!ddq->d_btimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_ino_softlimit && + be64_to_cpu(ddq->d_icount) > + be64_to_cpu(ddq->d_ino_softlimit)) { + if (!ddq->d_itimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + if (ddq->d_rtb_softlimit && + be64_to_cpu(ddq->d_rtbcount) > + be64_to_cpu(ddq->d_rtb_softlimit)) { + if (!ddq->d_rtbtimer) { + if (flags & XFS_QMOPT_DOWARN) + xfs_alert(mp, + "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", + str, (int)be32_to_cpu(ddq->d_id), ddq); + errs++; + } + } + } + + if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) + return errs; + + if (flags & XFS_QMOPT_DOWARN) + xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); + + /* + * Typically, a repair is only requested by quotacheck. + */ + ASSERT(id != -1); + ASSERT(flags & XFS_QMOPT_DQREPAIR); + memset(d, 0, sizeof(xfs_dqblk_t)); + + d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + d->dd_diskdq.d_version = XFS_DQUOT_VERSION; + d->dd_diskdq.d_flags = type; + d->dd_diskdq.d_id = cpu_to_be32(id); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + + return errs; +} + +STATIC bool +xfs_dquot_buf_verify_crc( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + int ndquots; + int i; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return true; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk( + XFS_BB_TO_FSB(mp, bp->b_length)); + + for (i = 0; i < ndquots; i++, d++) { + if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF)) + return false; + if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) + return false; + } + return true; +} + +STATIC bool +xfs_dquot_buf_verify( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + xfs_dqid_t id = 0; + int ndquots; + int i; + + /* + * if we are in log recovery, the quota subsystem has not been + * initialised so we have no quotainfo structure. In that case, we need + * to manually calculate the number of dquots in the buffer. + */ + if (mp->m_quotainfo) + ndquots = mp->m_quotainfo->qi_dqperchunk; + else + ndquots = xfs_calc_dquots_per_chunk(bp->b_length); + + /* + * On the first read of the buffer, verify that each dquot is valid. + * We don't know what the id of the dquot is supposed to be, just that + * they should be increasing monotonically within the buffer. If the + * first id is corrupt, then it will fail on the second dquot in the + * buffer so corruptions could point to the wrong dquot in this case. + */ + for (i = 0; i < ndquots; i++) { + struct xfs_disk_dquot *ddq; + int error; + + ddq = &d[i].dd_diskdq; + + if (i == 0) + id = be32_to_cpu(ddq->d_id); + + error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, + "xfs_dquot_buf_verify"); + if (error) + return false; + } + return true; +} + +static void +xfs_dquot_buf_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_dquot_buf_verify(mp, bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ +static void +xfs_dquot_buf_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify(mp, bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } +} + +const struct xfs_buf_ops xfs_dquot_buf_ops = { + .verify_read = xfs_dquot_buf_read_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; + diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c new file mode 100644 index 00000000000..f33fbaaa4d8 --- /dev/null +++ b/fs/xfs/xfs_dquot_item.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2000-2003 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_trans_priv.h" +#include "xfs_qm.h" +#include "xfs_log.h" + +static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_dq_logitem, qli_item); +} + +/* + * returns the number of iovecs needed to log the given dquot item. + */ +STATIC void +xfs_qm_dquot_logitem_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 2; + *nbytes += sizeof(struct xfs_dq_logformat) + + sizeof(struct xfs_disk_dquot); +} + +/* + * fills in the vector of log iovecs for the given dquot log item. + */ +STATIC void +xfs_qm_dquot_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_dq_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT); + qlf->qlf_type = XFS_LI_DQUOT; + qlf->qlf_size = 2; + qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id); + qlf->qlf_blkno = qlip->qli_dquot->q_blkno; + qlf->qlf_len = 1; + qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT, + &qlip->qli_dquot->q_core, + sizeof(struct xfs_disk_dquot)); +} + +/* + * Increment the pin count of the given dquot. + */ +STATIC void +xfs_qm_dquot_logitem_pin( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + atomic_inc(&dqp->q_pincount); +} + +/* + * Decrement the pin count of the given dquot, and wake up + * anyone in xfs_dqwait_unpin() if the count goes to 0. The + * dquot must have been previously pinned with a call to + * xfs_qm_dquot_logitem_pin(). + */ +STATIC void +xfs_qm_dquot_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(atomic_read(&dqp->q_pincount) > 0); + if (atomic_dec_and_test(&dqp->q_pincount)) + wake_up(&dqp->q_pinwait); +} + +STATIC xfs_lsn_t +xfs_qm_dquot_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + /* + * We always re-log the entire dquot when it becomes dirty, + * so, the latest copy _is_ the only one that matters. + */ + return lsn; +} + +/* + * This is called to wait for the given dquot to be unpinned. + * Most of these pin/unpin routines are plagiarized from inode code. + */ +void +xfs_qm_dqunpin_wait( + struct xfs_dquot *dqp) +{ + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + if (atomic_read(&dqp->q_pincount) == 0) + return; + + /* + * Give the log a push so we don't wait here too long. + */ + xfs_log_force(dqp->q_mount, 0); + wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); +} + +STATIC uint +xfs_qm_dquot_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) __releases(&lip->li_ailp->xa_lock) + __acquires(&lip->li_ailp->xa_lock) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; + + if (atomic_read(&dqp->q_pincount) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_dqlock_nowait(dqp)) + return XFS_ITEM_LOCKED; + + /* + * Re-check the pincount now that we stabilized the value by + * taking the quota lock. + */ + if (atomic_read(&dqp->q_pincount) > 0) { + rval = XFS_ITEM_PINNED; + goto out_unlock; + } + + /* + * Someone else is already flushing the dquot. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ + if (!xfs_dqflock_nowait(dqp)) { + rval = XFS_ITEM_FLUSHING; + goto out_unlock; + } + + spin_unlock(&lip->li_ailp->xa_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p", + __func__, error, dqp); + } else { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); + } + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_dqunlock(dqp); + return rval; +} + +/* + * Unlock the dquot associated with the log item. + * Clear the fields of the dquot and dquot log item that + * are specific to the current transaction. If the + * hold flags is set, do not unlock the dquot. + */ +STATIC void +xfs_qm_dquot_logitem_unlock( + struct xfs_log_item *lip) +{ + struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; + + ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + /* + * Clear the transaction pointer in the dquot + */ + dqp->q_transp = NULL; + + /* + * dquots are never 'held' from getting unlocked at the end of + * a transaction. Their locking and unlocking is hidden inside the + * transaction layer, within trans_commit. Hence, no LI_HOLD flag + * for the logitem. + */ + xfs_dqunlock(dqp); +} + +/* + * this needs to stamp an lsn into the dquot, I think. + * rpc's that look at user dquot's would then have to + * push on the dependency recorded in the dquot + */ +STATIC void +xfs_qm_dquot_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector for dquots + */ +static const struct xfs_item_ops xfs_dquot_item_ops = { + .iop_size = xfs_qm_dquot_logitem_size, + .iop_format = xfs_qm_dquot_logitem_format, + .iop_pin = xfs_qm_dquot_logitem_pin, + .iop_unpin = xfs_qm_dquot_logitem_unpin, + .iop_unlock = xfs_qm_dquot_logitem_unlock, + .iop_committed = xfs_qm_dquot_logitem_committed, + .iop_push = xfs_qm_dquot_logitem_push, + .iop_committing = xfs_qm_dquot_logitem_committing +}; + +/* + * Initialize the dquot log item for a newly allocated dquot. + * The dquot isn't locked at this point, but it isn't on any of the lists + * either, so we don't care. + */ +void +xfs_qm_dquot_logitem_init( + struct xfs_dquot *dqp) +{ + struct xfs_dq_logitem *lp = &dqp->q_logitem; + + xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT, + &xfs_dquot_item_ops); + lp->qli_dquot = dqp; +} + +/*------------------ QUOTAOFF LOG ITEMS -------------------*/ + +static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_qoff_logitem, qql_item); +} + + +/* + * This returns the number of iovecs needed to log the given quotaoff item. + * We only need 1 iovec for an quotaoff item. It just logs the + * quotaoff_log_format structure. + */ +STATIC void +xfs_qm_qoff_logitem_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_qoff_logitem); +} + +STATIC void +xfs_qm_qoff_logitem_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + struct xfs_qoff_logformat *qlf; + + qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); + qlf->qf_type = XFS_LI_QUOTAOFF; + qlf->qf_size = 1; + qlf->qf_flags = qflip->qql_flags; + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); +} + +/* + * Pinning has no meaning for an quotaoff item, so just return. + */ +STATIC void +xfs_qm_qoff_logitem_pin( + struct xfs_log_item *lip) +{ +} + +/* + * Since pinning has no meaning for an quotaoff item, unpinning does + * not either. + */ +STATIC void +xfs_qm_qoff_logitem_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +/* + * There isn't much you can do to push a quotaoff item. It is simply + * stuck waiting for the log to be flushed to disk. + */ +STATIC uint +xfs_qm_qoff_logitem_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + return XFS_ITEM_LOCKED; +} + +/* + * Quotaoff items have no locking or pushing, so return failure + * so that the caller doesn't bother with us. + */ +STATIC void +xfs_qm_qoff_logitem_unlock( + struct xfs_log_item *lip) +{ +} + +/* + * The quotaoff-start-item is logged only once and cannot be moved in the log, + * so simply return the lsn at which it's been logged. + */ +STATIC xfs_lsn_t +xfs_qm_qoff_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + return lsn; +} + +STATIC xfs_lsn_t +xfs_qm_qoffend_logitem_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); + struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; + struct xfs_ail *ailp = qfs->qql_item.li_ailp; + + /* + * Delete the qoff-start logitem from the AIL. + * xfs_trans_ail_delete() drops the AIL lock. + */ + spin_lock(&ailp->xa_lock); + xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); + + kmem_free(qfs); + kmem_free(qfe); + return (xfs_lsn_t)-1; +} + +/* + * XXX rcc - don't know quite what to do with this. I think we can + * just ignore it. The only time that isn't the case is if we allow + * the client to somehow see that quotas have been turned off in which + * we can't allow that to get back until the quotaoff hits the disk. + * So how would that happen? Also, do we need different routines for + * quotaoff start and quotaoff end? I suspect the answer is yes but + * to be sure, I need to look at the recovery code and see how quota off + * recovery is handled (do we roll forward or back or do something else). + * If we roll forwards or backwards, then we need two separate routines, + * one that does nothing and one that stamps in the lsn that matters + * (truly makes the quotaoff irrevocable). If we do something else, + * then maybe we don't need two. + */ +STATIC void +xfs_qm_qoff_logitem_committing( + struct xfs_log_item *lip, + xfs_lsn_t commit_lsn) +{ +} + +static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoffend_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * This is the ops vector shared by all quotaoff-start log items. + */ +static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { + .iop_size = xfs_qm_qoff_logitem_size, + .iop_format = xfs_qm_qoff_logitem_format, + .iop_pin = xfs_qm_qoff_logitem_pin, + .iop_unpin = xfs_qm_qoff_logitem_unpin, + .iop_unlock = xfs_qm_qoff_logitem_unlock, + .iop_committed = xfs_qm_qoff_logitem_committed, + .iop_push = xfs_qm_qoff_logitem_push, + .iop_committing = xfs_qm_qoff_logitem_committing +}; + +/* + * Allocate and initialize an quotaoff item of the correct quota type(s). + */ +struct xfs_qoff_logitem * +xfs_qm_qoff_logitem_init( + struct xfs_mount *mp, + struct xfs_qoff_logitem *start, + uint flags) +{ + struct xfs_qoff_logitem *qf; + + qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP); + + xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? + &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); + qf->qql_item.li_mountp = mp; + qf->qql_start_lip = start; + qf->qql_flags = flags; + return qf; +} diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 5a632531f84..502e9464634 100644 --- a/fs/xfs/quota/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -27,17 +27,12 @@ typedef struct xfs_dq_logitem { xfs_log_item_t qli_item; /* common portion */ struct xfs_dquot *qli_dquot; /* dquot ptr */ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ - unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */ -#ifdef DEBUG - uint64_t qli_push_owner; -#endif - xfs_dq_logformat_t qli_format; /* logged structure */ } xfs_dq_logitem_t; typedef struct xfs_qoff_logitem { xfs_log_item_t qql_item; /* common portion */ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - xfs_qoff_logformat_t qql_format; /* logged structure */ + unsigned int qql_flags; } xfs_qoff_logitem_t; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index b95681b03d8..edac5b057d2 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -16,21 +16,13 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" +#include "xfs_format.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_utils.h" #include "xfs_error.h" #ifdef DEBUG @@ -51,27 +43,17 @@ xfs_error_trap(int e) break; if (e != xfs_etrap[i]) continue; - cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); + xfs_notice(NULL, "%s: error %d", __func__, e); BUG(); break; } return e; } -#endif - -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) int xfs_etest[XFS_NUM_INJECT_ERROR]; int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR]; char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR]; - -void -xfs_error_test_init(void) -{ - memset(xfs_etest, 0, sizeof(xfs_etest)); - memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid)); - memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname)); -} +int xfs_error_test_active; int xfs_error_test(int error_tag, int *fsidp, char *expression, @@ -80,14 +62,14 @@ xfs_error_test(int error_tag, int *fsidp, char *expression, int i; int64_t fsid; - if (random() % randfactor) + if (prandom_u32() % randfactor) return 0; memcpy(&fsid, fsidp, sizeof(xfs_fsid_t)); for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { - cmn_err(CE_WARN, + xfs_warn(NULL, "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", expression, file, line, xfs_etest_fsname[i]); return 1; @@ -108,201 +90,116 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp) for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { - cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); + xfs_warn(mp, "error tag #%d on", error_tag); return 0; } } for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { if (xfs_etest[i] == 0) { - cmn_err(CE_WARN, "Turned on XFS error tag #%d", + xfs_warn(mp, "Turned on XFS error tag #%d", error_tag); xfs_etest[i] = error_tag; xfs_etest_fsid[i] = fsid; len = strlen(mp->m_fsname); xfs_etest_fsname[i] = kmem_alloc(len + 1, KM_SLEEP); strcpy(xfs_etest_fsname[i], mp->m_fsname); + xfs_error_test_active++; return 0; } } - cmn_err(CE_WARN, "error tag overflow, too many turned on"); + xfs_warn(mp, "error tag overflow, too many turned on"); return 1; } int -xfs_errortag_clear(int error_tag, xfs_mount_t *mp) +xfs_errortag_clearall(xfs_mount_t *mp, int loud) { - int i; int64_t fsid; + int cleared = 0; + int i; memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { - if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { - xfs_etest[i] = 0; - xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i], - strlen(xfs_etest_fsname[i]) + 1); - xfs_etest_fsname[i] = NULL; - cmn_err(CE_WARN, "Cleared XFS error tag #%d", - error_tag); - return 0; - } - } - - cmn_err(CE_WARN, "XFS error tag %d not on", error_tag); - - return 1; -} - -int -xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud) -{ - int i; - int cleared = 0; for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && xfs_etest[i] != 0) { cleared = 1; - cmn_err(CE_WARN, "Clearing XFS error tag #%d", + xfs_warn(mp, "Clearing XFS error tag #%d", xfs_etest[i]); xfs_etest[i] = 0; xfs_etest_fsid[i] = 0LL; - kmem_free(xfs_etest_fsname[i], - strlen(xfs_etest_fsname[i]) + 1); + kmem_free(xfs_etest_fsname[i]); xfs_etest_fsname[i] = NULL; + xfs_error_test_active--; } } if (loud || cleared) - cmn_err(CE_WARN, - "Cleared all XFS error tags for filesystem \"%s\"", - fsname); + xfs_warn(mp, "Cleared all XFS error tags for filesystem"); return 0; } - -int -xfs_errortag_clearall(xfs_mount_t *mp) -{ - int64_t fsid; - - memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); - - return xfs_errortag_clearall_umount(fsid, mp->m_fsname, 1); -} -#endif /* DEBUG || INDUCE_IO_ERROR */ - -static void -xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap) -{ - if (mp != NULL) { - char *newfmt; - int len = 16 + mp->m_fsname_len + strlen(fmt); - - newfmt = kmem_alloc(len, KM_SLEEP); - sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt); - icmn_err(level, newfmt, ap); - kmem_free(newfmt, len); - } else { - icmn_err(level, fmt, ap); - } -} +#endif /* DEBUG */ void -xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...) +xfs_error_report( + const char *tag, + int level, + struct xfs_mount *mp, + const char *filename, + int linenum, + inst_t *ra) { - va_list ap; + if (level <= xfs_error_level) { + xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, + "Internal error %s at line %d of file %s. Caller %pF", + tag, linenum, filename, ra); - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); + xfs_stack_trace(); + } } void -xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...) +xfs_corruption_error( + const char *tag, + int level, + struct xfs_mount *mp, + void *p, + const char *filename, + int linenum, + inst_t *ra) { - va_list ap; - -#ifdef DEBUG - xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; -#endif - - if (xfs_panic_mask && (xfs_panic_mask & panic_tag) - && (level & CE_ALERT)) { - level &= ~CE_ALERT; - level |= CE_PANIC; - cmn_err(CE_ALERT, "XFS: Transforming an alert into a BUG."); - } - va_start(ap, fmt); - xfs_fs_vcmn_err(level, mp, fmt, ap); - va_end(ap); + if (level <= xfs_error_level) + xfs_hex_dump(p, 64); + xfs_error_report(tag, level, mp, filename, linenum, ra); + xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair"); } +/* + * Warnings specifically for verifier errors. Differentiate CRC vs. invalid + * values, and omit the stack trace unless the error level is tuned high. + */ void -xfs_error_report( - char *tag, - int level, - xfs_mount_t *mp, - char *fname, - int linenum, - inst_t *ra) +xfs_verifier_error( + struct xfs_buf *bp) { - if (level <= xfs_error_level) { - xfs_cmn_err(XFS_PTAG_ERROR_REPORT, - CE_ALERT, mp, - "XFS internal error %s at line %d of file %s. Caller 0x%p\n", - tag, linenum, fname, ra); - - xfs_stack_trace(); - } -} + struct xfs_mount *mp = bp->b_target->bt_mount; -STATIC void -xfs_hex_dump(void *p, int length) -{ - __uint8_t *uip = (__uint8_t*)p; - int i; - char sbuf[128], *s; + xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + bp->b_error == EFSBADCRC ? "CRC error" : "corruption", + __return_address, bp->b_bn); - s = sbuf; - *s = '\0'; - for (i=0; i<length; i++, uip++) { - if ((i % 16) == 0) { - if (*s != '\0') - cmn_err(CE_ALERT, "%s\n", sbuf); - s = sbuf; - sprintf(s, "0x%x: ", i); - while( *s != '\0') - s++; - } - sprintf(s, "%02x ", *uip); + xfs_alert(mp, "Unmount and run xfs_repair"); - /* - * the kernel sprintf is a void; user sprintf returns - * the sprintf'ed string's length. Find the new end- - * of-string - */ - while( *s != '\0') - s++; + if (xfs_error_level >= XFS_ERRLEVEL_LOW) { + xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:"); + xfs_hex_dump(xfs_buf_offset(bp, 0), 64); } - cmn_err(CE_ALERT, "%s\n", sbuf); -} -void -xfs_corruption_error( - char *tag, - int level, - xfs_mount_t *mp, - void *p, - char *fname, - int linenum, - inst_t *ra) -{ - if (level <= xfs_error_level) - xfs_hex_dump(p, 16); - xfs_error_report(tag, level, mp, fname, linenum, ra); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); } diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index bc43163456e..c1c57d4a4b5 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -18,14 +18,6 @@ #ifndef __XFS_ERROR_H__ #define __XFS_ERROR_H__ -#define XFS_ERECOVER 1 /* Failure to recover log */ -#define XFS_ELOGSTAT 2 /* Failure to stat log in user space */ -#define XFS_ENOLOGSPACE 3 /* Reservation too large */ -#define XFS_ENOTSUP 4 /* Operation not supported */ -#define XFS_ENOLSN 5 /* Can't find the lsn you asked for */ -#define XFS_ENOTFOUND 6 -#define XFS_ENOTXFS 7 /* Not XFS filesystem */ - #ifdef DEBUG #define XFS_ERROR_NTRAP 10 extern int xfs_etrap[XFS_ERROR_NTRAP]; @@ -37,10 +29,12 @@ extern int xfs_error_trap(int); struct xfs_mount; -extern void xfs_error_report(char *tag, int level, struct xfs_mount *mp, - char *fname, int linenum, inst_t *ra); -extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, - void *p, char *fname, int linenum, inst_t *ra); +extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, + const char *filename, int linenum, inst_t *ra); +extern void xfs_corruption_error(const char *tag, int level, + struct xfs_mount *mp, void *p, const char *filename, + int linenum, inst_t *ra); +extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address) @@ -133,39 +127,27 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp, #define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10) #define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) +#ifdef DEBUG +extern int xfs_error_test_active; extern int xfs_error_test(int, int *, char *, int, char *, unsigned long); -extern void xfs_error_test_init(void); #define XFS_NUM_INJECT_ERROR 10 - -#ifdef __ANSI_CPP__ -#define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ - xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \ - (rf))) -#else #define XFS_TEST_ERROR(expr, mp, tag, rf) \ - ((expr) || \ + ((expr) || (xfs_error_test_active && \ xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ - (rf))) -#endif /* __ANSI_CPP__ */ + (rf)))) -extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clear(int error_tag, xfs_mount_t *mp); -extern int xfs_errortag_clearall(xfs_mount_t *mp); -extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); +extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); +extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); #else #define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) #define xfs_errortag_add(tag, mp) (ENOSYS) -#define xfs_errortag_clearall(mp) (ENOSYS) -#endif /* (DEBUG || INDUCE_IO_ERROR) */ +#define xfs_errortag_clearall(mp, loud) (ENOSYS) +#endif /* DEBUG */ /* - * XFS panic tags -- allow a call to xfs_cmn_err() be turned into - * a panic by setting xfs_panic_mask in a - * sysctl. update xfs_max[XFS_PARAM] if - * more are added. + * XFS panic tags -- allow a call to xfs_alert_tag() be turned into + * a panic by setting xfs_panic_mask in a sysctl. */ #define XFS_NO_PTAG 0 #define XFS_PTAG_IFLUSH 0x00000001 @@ -175,18 +157,6 @@ extern int xfs_errortag_clearall_umount(int64_t fsid, char *fsname, int loud); #define XFS_PTAG_SHUTDOWN_CORRUPT 0x00000010 #define XFS_PTAG_SHUTDOWN_IOERROR 0x00000020 #define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 - -struct xfs_mount; -/* PRINTFLIKE4 */ -extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp, - char *fmt, ...); -/* PRINTFLIKE3 */ -extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); - -#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ - xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) - -#define xfs_fs_mount_cmn_err(f, fmt, args...) \ - ((f & XFS_MFSI_QUIET)? cmn_err(CE_WARN, "XFS: " fmt, ## args) : (void)0) +#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c new file mode 100644 index 00000000000..753e467aa1a --- /dev/null +++ b/fs/xfs/xfs_export.c @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_dir2.h" +#include "xfs_export.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" + +/* + * Note that we only accept fileids which are long enough rather than allow + * the parent generation number to default to zero. XFS considers zero a + * valid generation number not an invalid/wildcard value. + */ +static int xfs_fileid_length(int fileid_type) +{ + switch (fileid_type) { + case FILEID_INO32_GEN: + return 2; + case FILEID_INO32_GEN_PARENT: + return 4; + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + return 3; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + return 6; + } + return FILEID_INVALID; +} + +STATIC int +xfs_fs_encode_fh( + struct inode *inode, + __u32 *fh, + int *max_len, + struct inode *parent) +{ + struct fid *fid = (struct fid *)fh; + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; + int fileid_type; + int len; + + /* Directories don't need their parent encoded, they have ".." */ + if (!parent) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + + /* + * If the the filesystem may contain 64bit inode numbers, we need + * to use larger file handles that can represent them. + * + * While we only allocate inodes that do not fit into 32 bits any + * large enough filesystem may contain them, thus the slightly + * confusing looking conditional below. + */ + if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || + (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + fileid_type |= XFS_FILEID_TYPE_64FLAG; + + /* + * Only encode if there is enough space given. In practice + * this means we can't export a filesystem with 64bit inodes + * over NFSv2 with the subtree_check export option; the other + * seven combinations work. The real answer is "don't use v2". + */ + len = xfs_fileid_length(fileid_type); + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + *max_len = len; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + fid->i32.parent_ino = XFS_I(parent)->i_ino; + fid->i32.parent_gen = parent->i_generation; + /*FALLTHRU*/ + case FILEID_INO32_GEN: + fid->i32.ino = XFS_I(inode)->i_ino; + fid->i32.gen = inode->i_generation; + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + fid64->parent_ino = XFS_I(parent)->i_ino; + fid64->parent_gen = parent->i_generation; + /*FALLTHRU*/ + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + fid64->ino = XFS_I(inode)->i_ino; + fid64->gen = inode->i_generation; + break; + } + + return fileid_type; +} + +STATIC struct inode * +xfs_nfs_get_inode( + struct super_block *sb, + u64 ino, + u32 generation) + { + xfs_mount_t *mp = XFS_M(sb); + xfs_inode_t *ip; + int error; + + /* + * NFS can sometimes send requests for ino 0. Fail them gracefully. + */ + if (ino == 0) + return ERR_PTR(-ESTALE); + + /* + * The XFS_IGET_UNTRUSTED means that an invalid inode number is just + * fine and not an indication of a corrupted filesystem as clients can + * send invalid file handles and we have to handle it gracefully.. + */ + error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip); + if (error) { + /* + * EINVAL means the inode cluster doesn't exist anymore. + * This implies the filehandle is stale, so we should + * translate it here. + * We don't use ESTALE directly down the chain to not + * confuse applications using bulkstat that expect EINVAL. + */ + if (error == EINVAL || error == ENOENT) + error = ESTALE; + return ERR_PTR(-error); + } + + if (ip->i_d.di_gen != generation) { + IRELE(ip); + return ERR_PTR(-ESTALE); + } + + return VFS_I(ip); +} + +STATIC struct dentry * +xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + case FILEID_INO32_GEN: + inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fileid_type) +{ + struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fid; + struct inode *inode = NULL; + + if (fh_len < xfs_fileid_length(fileid_type)) + return NULL; + + switch (fileid_type) { + case FILEID_INO32_GEN_PARENT: + inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino, + fid->i32.parent_gen); + break; + case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG: + inode = xfs_nfs_get_inode(sb, fid64->parent_ino, + fid64->parent_gen); + break; + } + + return d_obtain_alias(inode); +} + +STATIC struct dentry * +xfs_fs_get_parent( + struct dentry *child) +{ + int error; + struct xfs_inode *cip; + + error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); + if (unlikely(error)) + return ERR_PTR(-error); + + return d_obtain_alias(VFS_I(cip)); +} + +STATIC int +xfs_fs_nfs_commit_metadata( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + +const struct export_operations xfs_export_operations = { + .encode_fh = xfs_fs_encode_fh, + .fh_to_dentry = xfs_fs_fh_to_dentry, + .fh_to_parent = xfs_fs_fh_to_parent, + .get_parent = xfs_fs_get_parent, + .commit_metadata = xfs_fs_nfs_commit_metadata, +}; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/xfs_export.h index e794ca4efc7..3272b6ae7a3 100644 --- a/fs/xfs/linux-2.6/xfs_export.h +++ b/fs/xfs/xfs_export.h @@ -59,50 +59,14 @@ * a subdirectory) or use the "fsid" export option. */ +struct xfs_fid64 { + u64 ino; + u32 gen; + u64 parent_ino; + u32 parent_gen; +} __attribute__((packed)); + /* This flag goes on the wire. Don't play with it. */ #define XFS_FILEID_TYPE_64FLAG 0x80 /* NFS fileid has 64bit inodes */ -/* Calculate the length in u32 units of the fileid data */ -static inline int -xfs_fileid_length(int hasparent, int is64) -{ - return hasparent ? (is64 ? 6 : 4) : (is64 ? 3 : 2); -} - -/* - * Decode encoded inode information (either for the inode itself - * or the parent) into an xfs_fid2_t structure. Advances and - * returns the new data pointer - */ -static inline __u32 * -xfs_fileid_decode_fid2(__u32 *p, xfs_fid2_t *fid, int is64) -{ - fid->fid_len = sizeof(xfs_fid2_t) - sizeof(fid->fid_len); - fid->fid_pad = 0; - fid->fid_ino = *p++; -#if XFS_BIG_INUMS - if (is64) - fid->fid_ino |= (((__u64)(*p++)) << 32); -#endif - fid->fid_gen = *p++; - return p; -} - -/* - * Encode inode information (either for the inode itself or the - * parent) into a fileid buffer. Advances and returns the new - * data pointer. - */ -static inline __u32 * -xfs_fileid_encode_inode(__u32 *p, struct inode *inode, int is64) -{ - *p++ = (__u32)inode->i_ino; -#if XFS_BIG_INUMS - if (is64) - *p++ = (__u32)(inode->i_ino >> 32); -#endif - *p++ = inode->i_generation; - return p; -} - #endif /* __XFS_EXPORT_H__ */ diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c new file mode 100644 index 00000000000..fd22f69049d --- /dev/null +++ b/fs/xfs/xfs_extent_busy.c @@ -0,0 +1,605 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_trace.h" +#include "xfs_trans.h" +#include "xfs_log.h" + +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + struct xfs_extent_busy *new; + struct xfs_extent_busy *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent = NULL; + + new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL); + if (!new) { + /* + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. + */ + trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len); + xfs_trans_set_sync(tp); + return; + } + + new->agno = agno; + new->bno = bno; + new->length = len; + INIT_LIST_HEAD(&new->list); + new->flags = flags; + + /* trace before insert to be able to see failed inserts */ + trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); + + pag = xfs_perag_get(tp->t_mountp, new->agno); + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + while (*rbp) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); + + if (new->bno < busyp->bno) { + rbp = &(*rbp)->rb_left; + ASSERT(new->bno + new->length <= busyp->bno); + } else if (new->bno > busyp->bno) { + rbp = &(*rbp)->rb_right; + ASSERT(bno >= busyp->bno + busyp->length); + } else { + ASSERT(0); + } + } + + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_extent_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. + */ +int +xfs_extent_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_extent_busy *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + return match; +} + +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entry's block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_extent_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) __releases(&pag->pagb_lock) + __acquires(&pag->pagb_lock) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * This extent is currently being discarded. Give the thread + * performing the discard a chance to mark the extent unbusy + * and retry. + */ + if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { + spin_unlock(&pag->pagb_lock); + delay(1); + spin_lock(&pag->pagb_lock); + return false; + } + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ +void +xfs_extent_busy_reuse( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + struct xfs_perag *pag; + struct rb_node *rbp; + + ASSERT(flen > 0); + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +void +xfs_extent_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_extent_busy *busyp = + rb_entry(rbp, struct xfs_extent_busy, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata && + !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { + if (!xfs_extent_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +STATIC void +xfs_extent_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_extent_busy *busyp) +{ + if (busyp->length) { + trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); + kmem_free(busyp); +} + +/* + * Remove all extents on the passed in list from the busy extents tree. + * If do_discard is set skip extents that need to be discarded, and mark + * these as undergoing a discard operation instead. + */ +void +xfs_extent_busy_clear( + struct xfs_mount *mp, + struct list_head *list, + bool do_discard) +{ + struct xfs_extent_busy *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + if (do_discard && busyp->length && + !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) + busyp->flags = XFS_EXTENT_BUSY_DISCARDED; + else + xfs_extent_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_extent_busy_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_extent_busy, list)->agno - + container_of(b, struct xfs_extent_busy, list)->agno; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h new file mode 100644 index 00000000000..bfff284d2dc --- /dev/null +++ b/fs/xfs/xfs_extent_busy.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2010 David Chinner. + * Copyright (c) 2011 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_EXTENT_BUSY_H__ +#define __XFS_EXTENT_BUSY_H__ + +struct xfs_mount; +struct xfs_trans; +struct xfs_alloc_arg; + +/* + * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that + * have been freed but whose transactions aren't committed to disk yet. + * + * Note that we use the transaction ID to record the transaction, not the + * transaction structure itself. See xfs_extent_busy_insert() for details. + */ +struct xfs_extent_busy { + struct rb_node rb_node; /* ag by-bno indexed search tree */ + struct list_head list; /* transaction busy extent list */ + xfs_agnumber_t agno; + xfs_agblock_t bno; + xfs_extlen_t length; + unsigned int flags; +#define XFS_EXTENT_BUSY_DISCARDED 0x01 /* undergoing a discard op. */ +#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ +}; + +void +xfs_extent_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); + +void +xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, + bool do_discard); + +int +xfs_extent_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +void +xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno, + xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen); + +int +xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_extent_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_extent_busy_ag_cmp); +} + +#endif /* __XFS_EXTENT_BUSY_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6cf6d8769b9..fb7a4c1ce1c 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -17,36 +17,55 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_extfree_item.h" +#include "xfs_log.h" kmem_zone_t *xfs_efi_zone; kmem_zone_t *xfs_efd_zone; -STATIC void xfs_efi_item_unlock(xfs_efi_log_item_t *); -STATIC void xfs_efi_item_abort(xfs_efi_log_item_t *); -STATIC void xfs_efd_item_abort(xfs_efd_log_item_t *); - +static inline struct xfs_efi_log_item *EFI_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_efi_log_item, efi_item); +} void -xfs_efi_item_free(xfs_efi_log_item_t *efip) +xfs_efi_item_free( + struct xfs_efi_log_item *efip) { - int nexts = efip->efi_format.efi_nextents; - - if (nexts > XFS_EFI_MAX_FAST_EXTENTS) { - kmem_free(efip, sizeof(xfs_efi_log_item_t) + - (nexts - 1) * sizeof(xfs_extent_t)); - } else { + if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) + kmem_free(efip); + else kmem_zone_free(xfs_efi_zone, efip); +} + +/* + * Freeing the efi requires that we remove it from the AIL if it has already + * been placed there. However, the EFI may not yet have been placed in the AIL + * when called by xfs_efi_release() from EFD processing due to the ordering of + * committed vs unpin operations in bulk insert operations. Hence the reference + * count to ensure only the last caller frees the EFI. + */ +STATIC void +__xfs_efi_release( + struct xfs_efi_log_item *efip) +{ + struct xfs_ail *ailp = efip->efi_item.li_ailp; + + if (atomic_dec_and_test(&efip->efi_refcount)) { + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &efip->efi_item, + SHUTDOWN_LOG_IO_ERROR); + xfs_efi_item_free(efip); } } @@ -55,11 +74,22 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip) * We only need 1 iovec for an efi item. It just logs the efi_log_format * structure. */ -/*ARGSUSED*/ -STATIC uint -xfs_efi_item_size(xfs_efi_log_item_t *efip) +static inline int +xfs_efi_item_sizeof( + struct xfs_efi_log_item *efip) +{ + return sizeof(struct xfs_efi_log_format) + + (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void +xfs_efi_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += xfs_efi_item_sizeof(EFI_ITEM(lip)); } /* @@ -70,226 +100,151 @@ xfs_efi_item_size(xfs_efi_log_item_t *efip) * slots in the efi item have been filled. */ STATIC void -xfs_efi_item_format(xfs_efi_log_item_t *efip, - xfs_log_iovec_t *log_vector) +xfs_efi_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) { - uint size; + struct xfs_efi_log_item *efip = EFI_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; - ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); + ASSERT(atomic_read(&efip->efi_next_extent) == + efip->efi_format.efi_nextents); efip->efi_format.efi_type = XFS_LI_EFI; - - size = sizeof(xfs_efi_log_format_t); - size += (efip->efi_format.efi_nextents - 1) * sizeof(xfs_extent_t); efip->efi_format.efi_size = 1; - log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format); - log_vector->i_len = size; - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT); - ASSERT(size >= sizeof(xfs_efi_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT, + &efip->efi_format, + xfs_efi_item_sizeof(efip)); } /* * Pinning has no meaning for an efi item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_pin(xfs_efi_log_item_t *efip) +xfs_efi_item_pin( + struct xfs_log_item *lip) { - return; } - /* - * While EFIs cannot really be pinned, the unpin operation is the - * last place at which the EFI is manipulated during a transaction. - * Here we coordinate with xfs_efi_cancel() to determine who gets to - * free the EFI. + * While EFIs cannot really be pinned, the unpin operation is the last place at + * which the EFI is manipulated during a transaction. If we are being asked to + * remove the EFI it's because the transaction has been cancelled and by + * definition that means the EFI cannot be in the AIL so remove it from the + * transaction and free it. Otherwise coordinate with xfs_efi_release() + * to determine who gets to free the EFI. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale) +xfs_efi_item_unpin( + struct xfs_log_item *lip, + int remove) { - xfs_mount_t *mp; - SPLDECL(s); - - mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); - if (efip->efi_flags & XFS_EFI_CANCELED) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - AIL_UNLOCK(mp, s); - } -} + struct xfs_efi_log_item *efip = EFI_ITEM(lip); -/* - * like unpin only we have to also clear the xaction descriptor - * pointing the log item if we free the item. This routine duplicates - * unpin because efi_flags is protected by the AIL lock. Freeing - * the descriptor and then calling unpin would force us to drop the AIL - * lock which would open up a race condition. - */ -STATIC void -xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp) -{ - xfs_mount_t *mp; - xfs_log_item_desc_t *lidp; - SPLDECL(s); - - mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); - if (efip->efi_flags & XFS_EFI_CANCELED) { - /* - * free the xaction descriptor pointing to this item - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip); - xfs_trans_free_item(tp, lidp); - /* - * pull the item off the AIL. - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); + if (remove) { + ASSERT(!(lip->li_flags & XFS_LI_IN_AIL)); + if (lip->li_desc) + xfs_trans_del_item(lip); xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_COMMITTED; - AIL_UNLOCK(mp, s); + return; } + __xfs_efi_release(efip); } /* - * Efi items have no locking or pushing. However, since EFIs are - * pulled from the AIL when their corresponding EFDs are committed - * to disk, their situation is very similar to being pinned. Return - * XFS_ITEM_PINNED so that the caller will eventually flush the log. - * This should help in getting the EFI out of the AIL. + * Efi items have no locking or pushing. However, since EFIs are pulled from + * the AIL when their corresponding EFDs are committed to disk, their situation + * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller + * will eventually flush the log. This should help in getting the EFI out of + * the AIL. */ -/*ARGSUSED*/ STATIC uint -xfs_efi_item_trylock(xfs_efi_log_item_t *efip) +xfs_efi_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { return XFS_ITEM_PINNED; } -/* - * Efi items have no locking, so just return. - */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_unlock(xfs_efi_log_item_t *efip) +xfs_efi_item_unlock( + struct xfs_log_item *lip) { - if (efip->efi_item.li_flags & XFS_LI_ABORTED) - xfs_efi_item_abort(efip); - return; + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efi_item_free(EFI_ITEM(lip)); } /* - * The EFI is logged only once and cannot be moved in the log, so - * simply return the lsn at which it's been logged. The canceled - * flag is not paid any attention here. Checking for that is delayed - * until the EFI is unpinned. + * The EFI is logged only once and cannot be moved in the log, so simply return + * the lsn at which it's been logged. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_efi_item_committed(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +xfs_efi_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { return lsn; } /* - * This is called when the transaction logging the EFI is aborted. - * Free up the EFI and return. No need to clean up the slot for - * the item in the transaction. That was done by the unpin code - * which is called prior to this routine in the abort/fs-shutdown path. - */ -STATIC void -xfs_efi_item_abort(xfs_efi_log_item_t *efip) -{ - xfs_efi_item_free(efip); -} - -/* - * There isn't much you can do to push on an efi item. It is simply - * stuck waiting for all of its corresponding efd items to be - * committed to disk. - */ -/*ARGSUSED*/ -STATIC void -xfs_efi_item_push(xfs_efi_log_item_t *efip) -{ - return; -} - -/* * The EFI dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For * example, for inodes, the inode is locked throughout the extent freeing * so the dependency should be recorded there. */ -/*ARGSUSED*/ STATIC void -xfs_efi_item_committing(xfs_efi_log_item_t *efip, xfs_lsn_t lsn) +xfs_efi_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return; } /* * This is the ops vector shared by all efi log items. */ -STATIC struct xfs_item_ops xfs_efi_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_efi_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_efi_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_efi_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efi_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) - xfs_efi_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efi_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efi_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efi_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_efi_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_efi_item_abort, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efi_item_committing +static const struct xfs_item_ops xfs_efi_item_ops = { + .iop_size = xfs_efi_item_size, + .iop_format = xfs_efi_item_format, + .iop_pin = xfs_efi_item_pin, + .iop_unpin = xfs_efi_item_unpin, + .iop_unlock = xfs_efi_item_unlock, + .iop_committed = xfs_efi_item_committed, + .iop_push = xfs_efi_item_push, + .iop_committing = xfs_efi_item_committing }; /* * Allocate and initialize an efi item with the given number of extents. */ -xfs_efi_log_item_t * -xfs_efi_init(xfs_mount_t *mp, - uint nextents) +struct xfs_efi_log_item * +xfs_efi_init( + struct xfs_mount *mp, + uint nextents) { - xfs_efi_log_item_t *efip; + struct xfs_efi_log_item *efip; uint size; ASSERT(nextents > 0); if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efi_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efip = (xfs_efi_log_item_t*)kmem_zalloc(size, KM_SLEEP); + efip = kmem_zalloc(size, KM_SLEEP); } else { - efip = (xfs_efi_log_item_t*)kmem_zone_zalloc(xfs_efi_zone, - KM_SLEEP); + efip = kmem_zone_zalloc(xfs_efi_zone, KM_SLEEP); } - efip->efi_item.li_type = XFS_LI_EFI; - efip->efi_item.li_ops = &xfs_efi_item_ops; - efip->efi_item.li_mountp = mp; + xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_id = (__psint_t)(void*)efip; + atomic_set(&efip->efi_next_extent, 0); + atomic_set(&efip->efi_refcount, 2); - return (efip); + return efip; } /* @@ -302,7 +257,7 @@ xfs_efi_init(xfs_mount_t *mp, int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) { - xfs_efi_log_format_t *src_efi_fmt = (xfs_efi_log_format_t *)buf->i_addr; + xfs_efi_log_format_t *src_efi_fmt = buf->i_addr; uint i; uint len = sizeof(xfs_efi_log_format_t) + (src_efi_fmt->efi_nextents - 1) * sizeof(xfs_extent_t); @@ -315,8 +270,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) memcpy((char *)dst_efi_fmt, (char*)src_efi_fmt, len); return 0; } else if (buf->i_len == len32) { - xfs_efi_log_format_32_t *src_efi_fmt_32 = - (xfs_efi_log_format_32_t *)buf->i_addr; + xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr; dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size; @@ -330,8 +284,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } return 0; } else if (buf->i_len == len64) { - xfs_efi_log_format_64_t *src_efi_fmt_64 = - (xfs_efi_log_format_64_t *)buf->i_addr; + xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr; dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type; dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size; @@ -349,81 +302,38 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt) } /* - * This is called by the efd item code below to release references to - * the given efi item. Each efd calls this with the number of - * extents that it has logged, and when the sum of these reaches - * the total number of extents logged by this efi item we can free - * the efi item. - * - * Freeing the efi item requires that we remove it from the AIL. - * We'll use the AIL lock to protect our counters as well as - * the removal from the AIL. + * This is called by the efd item code below to release references to the given + * efi item. Each efd calls this with the number of extents that it has + * logged, and when the sum of these reaches the total number of extents logged + * by this efi item we can free the efi item. */ void xfs_efi_release(xfs_efi_log_item_t *efip, uint nextents) { - xfs_mount_t *mp; - int extents_left; - SPLDECL(s); - - mp = efip->efi_item.li_mountp; - ASSERT(efip->efi_next_extent > 0); - ASSERT(efip->efi_flags & XFS_EFI_COMMITTED); - - AIL_LOCK(mp, s); - ASSERT(efip->efi_next_extent >= nextents); - efip->efi_next_extent -= nextents; - extents_left = efip->efi_next_extent; - if (extents_left == 0) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - xfs_efi_item_free(efip); - } else { - AIL_UNLOCK(mp, s); + ASSERT(atomic_read(&efip->efi_next_extent) >= nextents); + if (atomic_sub_and_test(nextents, &efip->efi_next_extent)) { + /* recovery needs us to drop the EFI reference, too */ + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) + __xfs_efi_release(efip); + + __xfs_efi_release(efip); + /* efip may now have been freed, do not reference it again. */ } } -/* - * This is called when the transaction that should be committing the - * EFD corresponding to the given EFI is aborted. The committed and - * canceled flags are used to coordinate the freeing of the EFI and - * the references by the transaction that committed it. - */ -STATIC void -xfs_efi_cancel( - xfs_efi_log_item_t *efip) +static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) { - xfs_mount_t *mp; - SPLDECL(s); - - mp = efip->efi_item.li_mountp; - AIL_LOCK(mp, s); - if (efip->efi_flags & XFS_EFI_COMMITTED) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip, s); - xfs_efi_item_free(efip); - } else { - efip->efi_flags |= XFS_EFI_CANCELED; - AIL_UNLOCK(mp, s); - } + return container_of(lip, struct xfs_efd_log_item, efd_item); } STATIC void -xfs_efd_item_free(xfs_efd_log_item_t *efdp) +xfs_efd_item_free(struct xfs_efd_log_item *efdp) { - int nexts = efdp->efd_format.efd_nextents; - - if (nexts > XFS_EFD_MAX_FAST_EXTENTS) { - kmem_free(efdp, sizeof(xfs_efd_log_item_t) + - (nexts - 1) * sizeof(xfs_extent_t)); - } else { + if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) + kmem_free(efdp); + else kmem_zone_free(xfs_efd_zone, efdp); - } } /* @@ -431,11 +341,22 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp) * We only need 1 iovec for an efd item. It just logs the efd_log_format * structure. */ -/*ARGSUSED*/ -STATIC uint -xfs_efd_item_size(xfs_efd_log_item_t *efdp) +static inline int +xfs_efd_item_sizeof( + struct xfs_efd_log_item *efdp) +{ + return sizeof(xfs_efd_log_format_t) + + (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); +} + +STATIC void +xfs_efd_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - return 1; + *nvecs += 1; + *nbytes += xfs_efd_item_sizeof(EFD_ITEM(lip)); } /* @@ -446,76 +367,61 @@ xfs_efd_item_size(xfs_efd_log_item_t *efdp) * slots in the efd item have been filled. */ STATIC void -xfs_efd_item_format(xfs_efd_log_item_t *efdp, - xfs_log_iovec_t *log_vector) +xfs_efd_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) { - uint size; + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents); efdp->efd_format.efd_type = XFS_LI_EFD; - - size = sizeof(xfs_efd_log_format_t); - size += (efdp->efd_format.efd_nextents - 1) * sizeof(xfs_extent_t); efdp->efd_format.efd_size = 1; - log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format); - log_vector->i_len = size; - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT); - ASSERT(size >= sizeof(xfs_efd_log_format_t)); + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT, + &efdp->efd_format, + xfs_efd_item_sizeof(efdp)); } - /* * Pinning has no meaning for an efd item, so just return. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_pin(xfs_efd_log_item_t *efdp) +xfs_efd_item_pin( + struct xfs_log_item *lip) { - return; } - /* * Since pinning has no meaning for an efd item, unpinning does * not either. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_unpin(xfs_efd_log_item_t *efdp, int stale) +xfs_efd_item_unpin( + struct xfs_log_item *lip, + int remove) { - return; -} - -/*ARGSUSED*/ -STATIC void -xfs_efd_item_unpin_remove(xfs_efd_log_item_t *efdp, xfs_trans_t *tp) -{ - return; } /* - * Efd items have no locking, so just return success. + * There isn't much you can do to push on an efd item. It is simply stuck + * waiting for the log to be flushed to disk. */ -/*ARGSUSED*/ STATIC uint -xfs_efd_item_trylock(xfs_efd_log_item_t *efdp) +xfs_efd_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - return XFS_ITEM_LOCKED; + return XFS_ITEM_PINNED; } -/* - * Efd items have no locking or pushing, so return failure - * so that the caller doesn't bother with us. - */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) +xfs_efd_item_unlock( + struct xfs_log_item *lip) { - if (efdp->efd_item.li_flags & XFS_LI_ABORTED) - xfs_efd_item_abort(efdp); - return; + if (lip->li_flags & XFS_LI_ABORTED) + xfs_efd_item_free(EFD_ITEM(lip)); } /* @@ -525,15 +431,18 @@ xfs_efd_item_unlock(xfs_efd_log_item_t *efdp) * return -1 to keep the transaction code from further referencing * this item. */ -/*ARGSUSED*/ STATIC xfs_lsn_t -xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) +xfs_efd_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { + struct xfs_efd_log_item *efdp = EFD_ITEM(lip); + /* * If we got a log I/O error, it's always the case that the LR with the * EFI got unpinned and freed before the EFD got aborted. */ - if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) + if (!(lip->li_flags & XFS_LI_ABORTED)) xfs_efi_release(efdp->efd_efip, efdp->efd_format.efd_nextents); xfs_efd_item_free(efdp); @@ -541,102 +450,59 @@ xfs_efd_item_committed(xfs_efd_log_item_t *efdp, xfs_lsn_t lsn) } /* - * The transaction of which this EFD is a part has been aborted. - * Inform its companion EFI of this fact and then clean up after - * ourselves. No need to clean up the slot for the item in the - * transaction. That was done by the unpin code which is called - * prior to this routine in the abort/fs-shutdown path. - */ -STATIC void -xfs_efd_item_abort(xfs_efd_log_item_t *efdp) -{ - /* - * If we got a log I/O error, it's always the case that the LR with the - * EFI got unpinned and freed before the EFD got aborted. So don't - * reference the EFI at all in that case. - */ - if ((efdp->efd_item.li_flags & XFS_LI_ABORTED) == 0) - xfs_efi_cancel(efdp->efd_efip); - - xfs_efd_item_free(efdp); -} - -/* - * There isn't much you can do to push on an efd item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -/*ARGSUSED*/ -STATIC void -xfs_efd_item_push(xfs_efd_log_item_t *efdp) -{ - return; -} - -/* * The EFD dependency tracking op doesn't do squat. It can't because * it doesn't know where the free extent is coming from. The dependency * tracking has to be handled by the "enclosing" metadata object. For * example, for inodes, the inode is locked throughout the extent freeing * so the dependency should be recorded there. */ -/*ARGSUSED*/ STATIC void -xfs_efd_item_committing(xfs_efd_log_item_t *efip, xfs_lsn_t lsn) +xfs_efd_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) { - return; } /* * This is the ops vector shared by all efd log items. */ -STATIC struct xfs_item_ops xfs_efd_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_efd_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_efd_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_efd_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_efd_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_efd_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_efd_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_efd_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efd_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_efd_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_efd_item_abort, - .iop_pushbuf = NULL, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_efd_item_committing +static const struct xfs_item_ops xfs_efd_item_ops = { + .iop_size = xfs_efd_item_size, + .iop_format = xfs_efd_item_format, + .iop_pin = xfs_efd_item_pin, + .iop_unpin = xfs_efd_item_unpin, + .iop_unlock = xfs_efd_item_unlock, + .iop_committed = xfs_efd_item_committed, + .iop_push = xfs_efd_item_push, + .iop_committing = xfs_efd_item_committing }; - /* * Allocate and initialize an efd item with the given number of extents. */ -xfs_efd_log_item_t * -xfs_efd_init(xfs_mount_t *mp, - xfs_efi_log_item_t *efip, - uint nextents) +struct xfs_efd_log_item * +xfs_efd_init( + struct xfs_mount *mp, + struct xfs_efi_log_item *efip, + uint nextents) { - xfs_efd_log_item_t *efdp; + struct xfs_efd_log_item *efdp; uint size; ASSERT(nextents > 0); if (nextents > XFS_EFD_MAX_FAST_EXTENTS) { size = (uint)(sizeof(xfs_efd_log_item_t) + ((nextents - 1) * sizeof(xfs_extent_t))); - efdp = (xfs_efd_log_item_t*)kmem_zalloc(size, KM_SLEEP); + efdp = kmem_zalloc(size, KM_SLEEP); } else { - efdp = (xfs_efd_log_item_t*)kmem_zone_zalloc(xfs_efd_zone, - KM_SLEEP); + efdp = kmem_zone_zalloc(xfs_efd_zone, KM_SLEEP); } - efdp->efd_item.li_type = XFS_LI_EFD; - efdp->efd_item.li_ops = &xfs_efd_item_ops; - efdp->efd_item.li_mountp = mp; + xfs_log_item_init(mp, &efdp->efd_item, XFS_LI_EFD, &xfs_efd_item_ops); efdp->efd_efip = efip; efdp->efd_format.efd_nextents = nextents; efdp->efd_format.efd_efi_id = efip->efi_format.efi_id; - return (efdp); + return efdp; } diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 0ea45edaab0..0ffbce32d56 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -18,115 +18,36 @@ #ifndef __XFS_EXTFREE_ITEM_H__ #define __XFS_EXTFREE_ITEM_H__ +/* kernel only EFI/EFD definitions */ + struct xfs_mount; struct kmem_zone; -typedef struct xfs_extent { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; -} xfs_extent_t; - -/* - * Since an xfs_extent_t has types (start:64, len: 32) - * there are different alignments on 32 bit and 64 bit kernels. - * So we provide the different variants for use by a - * conversion routine. - */ - -typedef struct xfs_extent_32 { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; -} __attribute__((packed)) xfs_extent_32_t; - -typedef struct xfs_extent_64 { - xfs_dfsbno_t ext_start; - xfs_extlen_t ext_len; - __uint32_t ext_pad; -} xfs_extent_64_t; - -/* - * This is the structure used to lay out an efi log item in the - * log. The efi_extents field is a variable size array whose - * size is given by efi_nextents. - */ -typedef struct xfs_efi_log_format { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_t efi_extents[1]; /* array of extents to free */ -} xfs_efi_log_format_t; - -typedef struct xfs_efi_log_format_32 { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_32_t efi_extents[1]; /* array of extents to free */ -} __attribute__((packed)) xfs_efi_log_format_32_t; - -typedef struct xfs_efi_log_format_64 { - unsigned short efi_type; /* efi log item type */ - unsigned short efi_size; /* size of this item */ - uint efi_nextents; /* # extents to free */ - __uint64_t efi_id; /* efi identifier */ - xfs_extent_64_t efi_extents[1]; /* array of extents to free */ -} xfs_efi_log_format_64_t; - -/* - * This is the structure used to lay out an efd log item in the - * log. The efd_extents array is a variable size array whose - * size is given by efd_nextents; - */ -typedef struct xfs_efd_log_format { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_t efd_extents[1]; /* array of extents freed */ -} xfs_efd_log_format_t; - -typedef struct xfs_efd_log_format_32 { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_32_t efd_extents[1]; /* array of extents freed */ -} __attribute__((packed)) xfs_efd_log_format_32_t; - -typedef struct xfs_efd_log_format_64 { - unsigned short efd_type; /* efd log item type */ - unsigned short efd_size; /* size of this item */ - uint efd_nextents; /* # of extents freed */ - __uint64_t efd_efi_id; /* id of corresponding efi */ - xfs_extent_64_t efd_extents[1]; /* array of extents freed */ -} xfs_efd_log_format_64_t; - - -#ifdef __KERNEL__ - /* * Max number of extents in fast allocation path. */ #define XFS_EFI_MAX_FAST_EXTENTS 16 /* - * Define EFI flags. + * Define EFI flag bits. Manipulated by set/clear/test_bit operators. */ -#define XFS_EFI_RECOVERED 0x1 -#define XFS_EFI_COMMITTED 0x2 -#define XFS_EFI_CANCELED 0x4 +#define XFS_EFI_RECOVERED 1 /* - * This is the "extent free intention" log item. It is used - * to log the fact that some extents need to be free. It is - * used in conjunction with the "extent free done" log item - * described below. + * This is the "extent free intention" log item. It is used to log the fact + * that some extents need to be free. It is used in conjunction with the + * "extent free done" log item described below. + * + * The EFI is reference counted so that it is not freed prior to both the EFI + * and EFD being committed and unpinned. This ensures that when the last + * reference goes away the EFI will always be in the AIL as it has been + * unpinned, regardless of whether the EFD is processed before or after the EFI. */ typedef struct xfs_efi_log_item { xfs_log_item_t efi_item; - uint efi_flags; /* misc flags */ - uint efi_next_extent; + atomic_t efi_refcount; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ xfs_efi_log_format_t efi_format; } xfs_efi_log_item_t; @@ -157,6 +78,4 @@ int xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt); void xfs_efi_item_free(xfs_efi_log_item_t *); -#endif /* __KERNEL__ */ - #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c new file mode 100644 index 00000000000..1f66779d7a4 --- /dev/null +++ b/fs/xfs/xfs_file.c @@ -0,0 +1,1433 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_dir2.h" +#include "xfs_dir2_priv.h" +#include "xfs_ioctl.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_dinode.h" + +#include <linux/aio.h> +#include <linux/dcache.h> +#include <linux/falloc.h> +#include <linux/pagevec.h> + +static const struct vm_operations_struct xfs_file_vm_ops; + +/* + * Locking primitives for read and write IO paths to ensure we consistently use + * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. + */ +static inline void +xfs_rw_ilock( + struct xfs_inode *ip, + int type) +{ + if (type & XFS_IOLOCK_EXCL) + mutex_lock(&VFS_I(ip)->i_mutex); + xfs_ilock(ip, type); +} + +static inline void +xfs_rw_iunlock( + struct xfs_inode *ip, + int type) +{ + xfs_iunlock(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +static inline void +xfs_rw_ilock_demote( + struct xfs_inode *ip, + int type) +{ + xfs_ilock_demote(ip, type); + if (type & XFS_IOLOCK_EXCL) + mutex_unlock(&VFS_I(ip)->i_mutex); +} + +/* + * xfs_iozero + * + * xfs_iozero clears the specified range of buffer supplied, + * and marks all the affected blocks as valid and modified. If + * an affected block is not allocated, it will be allocated. If + * an affected block is not completely overwritten, and is not + * valid before the operation, it will be read from disk before + * being partially zeroed. + */ +int +xfs_iozero( + struct xfs_inode *ip, /* inode */ + loff_t pos, /* offset in file */ + size_t count) /* size of data to zero */ +{ + struct page *page; + struct address_space *mapping; + int status; + + mapping = VFS_I(ip)->i_mapping; + do { + unsigned offset, bytes; + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; + + zero_user(page, offset, bytes); + + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + pos += bytes; + count -= bytes; + status = 0; + } while (count); + + return (-status); +} + +/* + * Fsync operations on directories are much simpler than on regular files, + * as there is no file data to flush, and thus also no need for explicit + * cache flush operations, and there are no non-transaction metadata updates + * on directories either. + */ +STATIC int +xfs_dir_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct xfs_inode *ip = XFS_I(file->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + xfs_lsn_t lsn = 0; + + trace_xfs_dir_fsync(ip); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) + lsn = ip->i_itemp->ili_last_lsn; + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (!lsn) + return 0; + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); +} + +STATIC int +xfs_file_fsync( + struct file *file, + loff_t start, + loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + int error = 0; + int log_flushed = 0; + xfs_lsn_t lsn = 0; + + trace_xfs_file_fsync(ip); + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); + + if (mp->m_flags & XFS_MOUNT_BARRIER) { + /* + * If we have an RT and/or log subvolume we need to make sure + * to flush the write cache the device used for file data + * first. This is to ensure newly written file data make + * it to disk before logging the new inode size in case of + * an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + } + + /* + * All metadata updates are logged, which means that we just have + * to flush the log up to the latest LSN that touched the inode. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_ipincount(ip)) { + if (!datasync || + (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) + lsn = ip->i_itemp->ili_last_lsn; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (lsn) + error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + + /* + * If we only have a single device, and the log force about was + * a no-op we might have to flush the data device cache here. + * This can only happen for fdatasync/O_DSYNC if we were overwriting + * an already allocated file and thus do not have any metadata to + * commit. + */ + if ((mp->m_flags & XFS_MOUNT_BARRIER) && + mp->m_logdev_targp == mp->m_ddev_targp && + !XFS_IS_REALTIME_INODE(ip) && + !log_flushed) + xfs_blkdev_issue_flush(mp->m_ddev_targp); + + return -error; +} + +STATIC ssize_t +xfs_file_read_iter( + struct kiocb *iocb, + struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + size_t size = iov_iter_count(to); + ssize_t ret = 0; + int ioflags = 0; + xfs_fsize_t n; + loff_t pos = iocb->ki_pos; + + XFS_STATS_INC(xs_read_calls); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + if (file->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + if (unlikely(ioflags & IO_ISDIRECT)) { + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + /* DIO must be aligned to device logical sector size */ + if ((pos | size) & target->bt_logical_sectormask) { + if (pos == i_size_read(inode)) + return 0; + return -XFS_ERROR(EINVAL); + } + } + + n = mp->m_super->s_maxbytes - pos; + if (n <= 0 || size == 0) + return 0; + + if (n < size) + size = n; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + /* + * Locking is a bit tricky here. If we take an exclusive lock + * for direct IO, we effectively serialise all new concurrent + * read IO to this file and block it behind IO that is currently in + * progress because IO in progress holds the IO lock shared. We only + * need to hold the lock exclusive to blow away the page cache, so + * only take lock exclusively if the page cache needs invalidation. + * This allows the normal direct IO case of no page cache pages to + * proceeed concurrently without serialisation. + */ + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) { + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); + + if (inode->i_mapping->nrpages) { + ret = filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + pos, -1); + if (ret) { + xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); + return ret; + } + truncate_pagecache_range(VFS_I(ip), pos, -1); + } + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + } + + trace_xfs_file_read(ip, size, pos, ioflags); + + ret = generic_file_read_iter(iocb, to); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + unsigned int flags) +{ + struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); + int ioflags = 0; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + + if (infilp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + + trace_xfs_file_splice_read(ip, count, *ppos, ioflags); + + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); + + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +/* + * This routine is called to handle zeroing any space in the last block of the + * file that is beyond the EOF. We do this since the size is being increased + * without writing anything to that block and we don't want to read the + * garbage on the disk. + */ +STATIC int /* error (positive) */ +xfs_zero_last_block( + struct xfs_inode *ip, + xfs_fsize_t offset, + xfs_fsize_t isize) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); + int zero_offset = XFS_B_FSB_OFFSET(mp, isize); + int zero_len; + int nimaps = 1; + int error = 0; + struct xfs_bmbt_irec imap; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + ASSERT(nimaps > 0); + + /* + * If the block underlying isize is just a hole, then there + * is nothing to zero. + */ + if (imap.br_startblock == HOLESTARTBLOCK) + return 0; + + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; + return xfs_iozero(ip, isize, zero_len); +} + +/* + * Zero any on disk space between the current EOF and the new, larger EOF. + * + * This handles the normal case of zeroing the remainder of the last block in + * the file and the unusual case of zeroing blocks out beyond the size of the + * file. This second case only happens with fixed size extents and when the + * system crashes before the inode size was updated but after blocks were + * allocated. + * + * Expects the iolock to be held exclusive, and will take the ilock internally. + */ +int /* error (positive) */ +xfs_zero_eof( + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ + xfs_fsize_t isize) /* current inode size */ +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; + xfs_fileoff_t end_zero_fsb; + xfs_fileoff_t zero_count_fsb; + xfs_fileoff_t last_fsb; + xfs_fileoff_t zero_off; + xfs_fsize_t zero_len; + int nimaps; + int error = 0; + struct xfs_bmbt_irec imap; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(offset > isize); + + /* + * First handle zeroing the block on which isize resides. + * + * We only zero a part of that block so it is handled specially. + */ + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { + error = xfs_zero_last_block(ip, offset, isize); + if (error) + return error; + } + + /* + * Calculate the range between the new size and the old where blocks + * needing to be zeroed may exist. + * + * To get the block where the last byte in the file currently resides, + * we need to subtract one from the size and truncate back to a block + * boundary. We subtract 1 in case the size is exactly on a block + * boundary. + */ + last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; + start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); + end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); + ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); + if (last_fsb == end_zero_fsb) { + /* + * The size was only incremented on its last block. + * We took care of that above, so just return. + */ + return 0; + } + + ASSERT(start_zero_fsb <= end_zero_fsb); + while (start_zero_fsb <= end_zero_fsb) { + nimaps = 1; + zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb, + &imap, &nimaps, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + ASSERT(nimaps > 0); + + if (imap.br_state == XFS_EXT_UNWRITTEN || + imap.br_startblock == HOLESTARTBLOCK) { + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + continue; + } + + /* + * There are blocks we need to zero. + */ + zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); + zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); + + if ((zero_off + zero_len) > offset) + zero_len = offset - zero_off; + + error = xfs_iozero(ip, zero_off, zero_len); + if (error) + return error; + + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + } + + return 0; +} + +/* + * Common pre-write limit and setup checks. + * + * Called with the iolocked held either shared and exclusive according to + * @iolock, and returns with it held. Might upgrade the iolock to exclusive + * if called for a direct write beyond i_size. + */ +STATIC ssize_t +xfs_file_aio_write_checks( + struct file *file, + loff_t *pos, + size_t *count, + int *iolock) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + +restart: + error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); + if (error) + return error; + + /* + * If the offset is beyond the size of the file, we need to zero any + * blocks that fall between the existing EOF and the start of this + * write. If zeroing is needed and we are currently holding the + * iolock shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*pos > i_size_read(inode)) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } + error = -xfs_zero_eof(ip, *pos, i_size_read(inode)); + if (error) + return error; + } + + /* + * Updating the timestamps will grab the ilock again from + * xfs_fs_dirty_inode, so we have to call it after dropping the + * lock above. Eventually we should look into a way to avoid + * the pointless lock roundtrip. + */ + if (likely(!(file->f_mode & FMODE_NOCMTIME))) { + error = file_update_time(file); + if (error) + return error; + } + + /* + * If we're writing the file then make sure to clear the setuid and + * setgid bits if the process is not being run by root. This keeps + * people from modifying setuid and setgid binaries. + */ + return file_remove_suid(file); +} + +/* + * xfs_file_dio_aio_write - handle direct IO writes + * + * Lock the inode appropriately to prepare for and issue a direct IO write. + * By separating it from the buffered write path we remove all the tricky to + * follow locking changes and looping. + * + * If there are cached pages or we're extending the file, we need IOLOCK_EXCL + * until we're sure the bytes at the new EOF have been zeroed and/or the cached + * pages are flushed out. + * + * In most cases the direct IO writes will be done holding IOLOCK_SHARED + * allowing them to be done in parallel with reads and other direct IO writes. + * However, if the IO is not aligned to filesystem blocks, the direct IO layer + * needs to do sub-block zeroing and that requires serialisation against other + * direct IOs to the same block. In this case we need to serialise the + * submission of the unaligned IOs so that we don't get racing block zeroing in + * the dio layer. To avoid the problem with aio, we also need to wait for + * outstanding IOs to complete so that unwritten extent conversion is completed + * before we try to map the overlapping block. This is currently implemented by + * hitting it with a big hammer (i.e. inode_dio_wait()). + * + * Returns with locks held indicated by @iolock and errors indicated by + * negative return values. + */ +STATIC ssize_t +xfs_file_dio_aio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + ssize_t ret = 0; + int unaligned_io = 0; + int iolock; + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + /* DIO must be aligned to device logical sector size */ + if ((pos | count) & target->bt_logical_sectormask) + return -XFS_ERROR(EINVAL); + + /* "unaligned" here means not aligned to a filesystem block */ + if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask)) + unaligned_io = 1; + + /* + * We don't need to take an exclusive lock unless there page cache needs + * to be invalidated or unaligned IO is being executed. We don't need to + * consider the EOF extension case here because + * xfs_file_aio_write_checks() will relock the inode as necessary for + * EOF zeroing cases and fill out the new inode size as appropriate. + */ + if (unaligned_io || mapping->nrpages) + iolock = XFS_IOLOCK_EXCL; + else + iolock = XFS_IOLOCK_SHARED; + xfs_rw_ilock(ip, iolock); + + /* + * Recheck if there are cached pages that need invalidate after we got + * the iolock to protect against other threads adding new pages while + * we were waiting for the iolock. + */ + if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, iolock); + iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, iolock); + } + + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + if (ret) + goto out; + iov_iter_truncate(from, count); + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + pos, -1); + if (ret) + goto out; + truncate_pagecache_range(VFS_I(ip), pos, -1); + } + + /* + * If we are doing unaligned IO, wait for all other IO to drain, + * otherwise demote the lock if we had to flush cached pages + */ + if (unaligned_io) + inode_dio_wait(inode); + else if (iolock == XFS_IOLOCK_EXCL) { + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); + ret = generic_file_direct_write(iocb, from, pos); + +out: + xfs_rw_iunlock(ip, iolock); + + /* No fallback to buffered IO on errors for XFS. */ + ASSERT(ret < 0 || ret == count); + return ret; +} + +STATIC ssize_t +xfs_file_buffered_aio_write( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + int enospc = 0; + int iolock = XFS_IOLOCK_EXCL; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); + + xfs_rw_ilock(ip, iolock); + + ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + if (ret) + goto out; + + iov_iter_truncate(from, count); + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + +write_retry: + trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); + ret = generic_perform_write(file, from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; + /* + * If we just got an ENOSPC, try to write back all dirty inodes to + * convert delalloc space to free up some of the excess reserved + * metadata space. + */ + if (ret == -ENOSPC && !enospc) { + enospc = 1; + xfs_flush_inodes(ip->i_mount); + goto write_retry; + } + + current->backing_dev_info = NULL; +out: + xfs_rw_iunlock(ip, iolock); + return ret; +} + +STATIC ssize_t +xfs_file_write_iter( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct xfs_inode *ip = XFS_I(inode); + ssize_t ret; + size_t ocount = iov_iter_count(from); + + XFS_STATS_INC(xs_write_calls); + + if (ocount == 0) + return 0; + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (unlikely(file->f_flags & O_DIRECT)) + ret = xfs_file_dio_aio_write(iocb, from); + else + ret = xfs_file_buffered_aio_write(iocb, from); + + if (ret > 0) { + ssize_t err; + + XFS_STATS_ADD(xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; +} + +STATIC long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_trans *tp; + long error; + loff_t new_size = 0; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + return -EOPNOTSUPP; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (mode & FALLOC_FL_PUNCH_HOLE) { + error = xfs_free_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + if (offset & blksize_mask || len & blksize_mask) { + error = EINVAL; + goto out_unlock; + } + + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = EINVAL; + goto out_unlock; + } + + new_size = i_size_read(inode) - len; + + error = xfs_collapse_file_space(ip, offset, len); + if (error) + goto out_unlock; + } else { + if (!(mode & FALLOC_FL_KEEP_SIZE) && + offset + len > i_size_read(inode)) { + new_size = offset + len; + error = -inode_newsize_ok(inode, new_size); + if (error) + goto out_unlock; + } + + if (mode & FALLOC_FL_ZERO_RANGE) + error = xfs_zero_file_space(ip, offset, len); + else + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_PREALLOC); + if (error) + goto out_unlock; + } + + tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE))) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (file->f_flags & O_DSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + if (error) + goto out_unlock; + + /* Change file size if needed */ + if (new_size) { + struct iattr iattr; + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = new_size; + error = xfs_setattr_size(ip, &iattr); + } + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; +} + + +STATIC int +xfs_file_open( + struct inode *inode, + struct file *file) +{ + if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) + return -EFBIG; + if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + return -EIO; + return 0; +} + +STATIC int +xfs_dir_open( + struct inode *inode, + struct file *file) +{ + struct xfs_inode *ip = XFS_I(inode); + int mode; + int error; + + error = xfs_file_open(inode, file); + if (error) + return error; + + /* + * If there are any blocks, read-ahead block 0 as we're almost + * certain to have the next operation be a read there. + */ + mode = xfs_ilock_data_map_shared(ip); + if (ip->i_d.di_nextents > 0) + xfs_dir3_data_readahead(ip, 0, -1); + xfs_iunlock(ip, mode); + return 0; +} + +STATIC int +xfs_file_release( + struct inode *inode, + struct file *filp) +{ + return -xfs_release(XFS_I(inode)); +} + +STATIC int +xfs_file_readdir( + struct file *file, + struct dir_context *ctx) +{ + struct inode *inode = file_inode(file); + xfs_inode_t *ip = XFS_I(inode); + int error; + size_t bufsize; + + /* + * The Linux API doesn't pass down the total size of the buffer + * we read into down to the filesystem. With the filldir concept + * it's not needed for correct information, but the XFS dir2 leaf + * code wants an estimate of the buffer size to calculate it's + * readahead window and size the buffers used for mapping to + * physical blocks. + * + * Try to give it an estimate that's good enough, maybe at some + * point we can change the ->readdir prototype to include the + * buffer size. For now we use the current glibc buffer size. + */ + bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size); + + error = xfs_readdir(ip, ctx, bufsize); + if (error) + return -error; + return 0; +} + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + vma->vm_ops = &xfs_file_vm_ops; + + file_accessed(filp); + return 0; +} + +/* + * mmap()d file has taken write protection fault and is being made + * writable. We can set the page state up correctly for a writable + * page, which means we can do correct delalloc accounting (ENOSPC + * checking!) and unwritten extent mapping. + */ +STATIC int +xfs_vm_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + return block_page_mkwrite(vma, vmf, xfs_get_blocks); +} + +/* + * This type is designed to indicate the type of offset we would like + * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). + */ +enum { + HOLE_OFF = 0, + DATA_OFF, +}; + +/* + * Lookup the desired type of offset from the given page. + * + * On success, return true and the offset argument will point to the + * start of the region that was found. Otherwise this function will + * return false and keep the offset argument unchanged. + */ +STATIC bool +xfs_lookup_buffer_offset( + struct page *page, + loff_t *offset, + unsigned int type) +{ + loff_t lastoff = page_offset(page); + bool found = false; + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + /* + * Unwritten extents that have data in the page + * cache covering them can be identified by the + * BH_Unwritten state flag. Pages with multiple + * buffers might have a mix of holes, data and + * unwritten extents - any buffer with valid + * data in it should have BH_Uptodate flag set + * on it. + */ + if (buffer_unwritten(bh) || + buffer_uptodate(bh)) { + if (type == DATA_OFF) + found = true; + } else { + if (type == HOLE_OFF) + found = true; + } + + if (found) { + *offset = lastoff; + break; + } + lastoff += bh->b_size; + } while ((bh = bh->b_this_page) != head); + + return found; +} + +/* + * This routine is called to find out and return a data or hole offset + * from the page cache for unwritten extents according to the desired + * type for xfs_seek_data() or xfs_seek_hole(). + * + * The argument offset is used to tell where we start to search from the + * page cache. Map is used to figure out the end points of the range to + * lookup pages. + * + * Return true if the desired type of offset was found, and the argument + * offset is filled with that address. Otherwise, return false and keep + * offset unchanged. + */ +STATIC bool +xfs_find_get_desired_pgoff( + struct inode *inode, + struct xfs_bmbt_irec *map, + unsigned int type, + loff_t *offset) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct pagevec pvec; + pgoff_t index; + pgoff_t end; + loff_t endoff; + loff_t startoff = *offset; + loff_t lastoff = startoff; + bool found = false; + + pagevec_init(&pvec, 0); + + index = startoff >> PAGE_CACHE_SHIFT; + endoff = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount); + end = endoff >> PAGE_CACHE_SHIFT; + do { + int want; + unsigned nr_pages; + unsigned int i; + + want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); + /* + * No page mapped into given range. If we are searching holes + * and if this is the first time we got into the loop, it means + * that the given offset is landed in a hole, return it. + * + * If we have already stepped through some block buffers to find + * holes but they all contains data. In this case, the last + * offset is already updated and pointed to the end of the last + * mapped page, if it does not reach the endpoint to search, + * that means there should be a hole between them. + */ + if (nr_pages == 0) { + /* Data search found nothing */ + if (type == DATA_OFF) + break; + + ASSERT(type == HOLE_OFF); + if (lastoff == startoff || lastoff < endoff) { + found = true; + *offset = lastoff; + } + break; + } + + /* + * At lease we found one page. If this is the first time we + * step into the loop, and if the first page index offset is + * greater than the given search offset, a hole was found. + */ + if (type == HOLE_OFF && lastoff == startoff && + lastoff < page_offset(pvec.pages[0])) { + found = true; + break; + } + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + loff_t b_offset; + + /* + * At this point, the page may be truncated or + * invalidated (changing page->mapping to NULL), + * or even swizzled back from swapper_space to tmpfs + * file mapping. However, page->index will not change + * because we have a reference on the page. + * + * Searching done if the page index is out of range. + * If the current offset is not reaches the end of + * the specified search range, there should be a hole + * between them. + */ + if (page->index > end) { + if (type == HOLE_OFF && lastoff < endoff) { + *offset = lastoff; + found = true; + } + goto out; + } + + lock_page(page); + /* + * Page truncated or invalidated(page->mapping == NULL). + * We can freely skip it and proceed to check the next + * page. + */ + if (unlikely(page->mapping != inode->i_mapping)) { + unlock_page(page); + continue; + } + + if (!page_has_buffers(page)) { + unlock_page(page); + continue; + } + + found = xfs_lookup_buffer_offset(page, &b_offset, type); + if (found) { + /* + * The found offset may be less than the start + * point to search if this is the first time to + * come here. + */ + *offset = max_t(loff_t, startoff, b_offset); + unlock_page(page); + goto out; + } + + /* + * We either searching data but nothing was found, or + * searching hole but found a data buffer. In either + * case, probably the next page contains the desired + * things, update the last offset to it so. + */ + lastoff = page_offset(page) + PAGE_SIZE; + unlock_page(page); + } + + /* + * The number of returned pages less than our desired, search + * done. In this case, nothing was found for searching data, + * but we found a hole behind the last offset. + */ + if (nr_pages < want) { + if (type == HOLE_OFF) { + *offset = lastoff; + found = true; + } + break; + } + + index = pvec.pages[i - 1]->index + 1; + pagevec_release(&pvec); + } while (index <= end); + +out: + pagevec_release(&pvec); + return found; +} + +STATIC loff_t +xfs_seek_data( + struct file *file, + loff_t start) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + lock = xfs_ilock_data_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + /* + * Try to read extents from the first block indicated + * by fsbno to the end block of the file. + */ + fsbno = XFS_B_TO_FSBT(mp, start); + end = XFS_B_TO_FSB(mp, isize); + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a data extent */ + if (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock))) + goto out; + + /* + * Landed in an unwritten extent, try to search data + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + DATA_OFF, &offset)) + goto out; + } + } + + /* + * map[0] is hole or its an unwritten extent but + * without data in page cache. Probably means that + * we are reading after EOF if nothing in map[1]. + */ + if (nmap == 1) { + error = ENXIO; + goto out_unlock; + } + + ASSERT(i > 1); + + /* + * Nothing was found, proceed to the next round of search + * if reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + } + +out: + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out_unlock: + xfs_iunlock(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_seek_hole( + struct file *file, + loff_t start) +{ + struct inode *inode = file->f_mapping->host; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + loff_t uninitialized_var(offset); + xfs_fsize_t isize; + xfs_fileoff_t fsbno; + xfs_filblks_t end; + uint lock; + int error; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + lock = xfs_ilock_data_map_shared(ip); + + isize = i_size_read(inode); + if (start >= isize) { + error = ENXIO; + goto out_unlock; + } + + fsbno = XFS_B_TO_FSBT(mp, start); + end = XFS_B_TO_FSB(mp, isize); + + for (;;) { + struct xfs_bmbt_irec map[2]; + int nmap = 2; + unsigned int i; + + error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, + XFS_BMAPI_ENTIRE); + if (error) + goto out_unlock; + + /* No extents at given offset, must be beyond EOF */ + if (nmap == 0) { + error = ENXIO; + goto out_unlock; + } + + for (i = 0; i < nmap; i++) { + offset = max_t(loff_t, start, + XFS_FSB_TO_B(mp, map[i].br_startoff)); + + /* Landed in a hole */ + if (map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* + * Landed in an unwritten extent, try to search hole + * from page cache. + */ + if (map[i].br_state == XFS_EXT_UNWRITTEN) { + if (xfs_find_get_desired_pgoff(inode, &map[i], + HOLE_OFF, &offset)) + goto out; + } + } + + /* + * map[0] contains data or its unwritten but contains + * data in page cache, probably means that we are + * reading after EOF. We should fix offset to point + * to the end of the file(i.e., there is an implicit + * hole at the end of any file). + */ + if (nmap == 1) { + offset = isize; + break; + } + + ASSERT(i > 1); + + /* + * Both mappings contains data, proceed to the next round of + * search if the current reading offset not beyond or hit EOF. + */ + fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; + start = XFS_FSB_TO_B(mp, fsbno); + if (start >= isize) { + offset = isize; + break; + } + } + +out: + /* + * At this point, we must have found a hole. However, the returned + * offset may be bigger than the file size as it may be aligned to + * page boundary for unwritten extents, we need to deal with this + * situation in particular. + */ + offset = min_t(loff_t, offset, isize); + offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + +out_unlock: + xfs_iunlock(ip, lock); + + if (error) + return -error; + return offset; +} + +STATIC loff_t +xfs_file_llseek( + struct file *file, + loff_t offset, + int origin) +{ + switch (origin) { + case SEEK_END: + case SEEK_CUR: + case SEEK_SET: + return generic_file_llseek(file, offset, origin); + case SEEK_DATA: + return xfs_seek_data(file, offset); + case SEEK_HOLE: + return xfs_seek_hole(file, offset); + default: + return -EINVAL; + } +} + +const struct file_operations xfs_file_operations = { + .llseek = xfs_file_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = xfs_file_read_iter, + .write_iter = xfs_file_write_iter, + .splice_read = xfs_file_splice_read, + .splice_write = iter_file_splice_write, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .mmap = xfs_file_mmap, + .open = xfs_file_open, + .release = xfs_file_release, + .fsync = xfs_file_fsync, + .fallocate = xfs_file_fallocate, +}; + +const struct file_operations xfs_dir_file_operations = { + .open = xfs_dir_open, + .read = generic_read_dir, + .iterate = xfs_file_readdir, + .llseek = generic_file_llseek, + .unlocked_ioctl = xfs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = xfs_file_compat_ioctl, +#endif + .fsync = xfs_dir_fsync, +}; + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, +}; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c new file mode 100644 index 00000000000..8ec81bed799 --- /dev/null +++ b/fs/xfs/xfs_filestream.c @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * Copyright (c) 2014 Christoph Hellwig. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_inum.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_alloc.h" +#include "xfs_mru_cache.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" +#include "xfs_trace.h" + +struct xfs_fstrm_item { + struct xfs_mru_cache_elem mru; + struct xfs_inode *ip; + xfs_agnumber_t ag; /* AG in use for this directory */ +}; + +enum xfs_fstrm_alloc { + XFS_PICK_USERDATA = 1, + XFS_PICK_LOWSPACE = 2, +}; + +/* + * Allocation group filestream associations are tracked with per-ag atomic + * counters. These counters allow xfs_filestream_pick_ag() to tell whether a + * particular AG already has active filestreams associated with it. The mount + * point's m_peraglock is used to protect these counters from per-ag array + * re-allocation during a growfs operation. When xfs_growfs_data_private() is + * about to reallocate the array, it calls xfs_filestream_flush() with the + * m_peraglock held in write mode. + * + * Since xfs_mru_cache_flush() guarantees that all the free functions for all + * the cache elements have finished executing before it returns, it's safe for + * the free functions to use the atomic counters without m_peraglock protection. + * This allows the implementation of xfs_fstrm_free_func() to be agnostic about + * whether it was called with the m_peraglock held in read mode, write mode or + * not held at all. The race condition this addresses is the following: + * + * - The work queue scheduler fires and pulls a filestream directory cache + * element off the LRU end of the cache for deletion, then gets pre-empted. + * - A growfs operation grabs the m_peraglock in write mode, flushes all the + * remaining items from the cache and reallocates the mount point's per-ag + * array, resetting all the counters to zero. + * - The work queue thread resumes and calls the free function for the element + * it started cleaning up earlier. In the process it decrements the + * filestreams counter for an AG that now has no references. + * + * With a shrinkfs feature, the above scenario could panic the system. + * + * All other uses of the following macros should be protected by either the + * m_peraglock held in read mode, or the cache's internal locking exposed by the + * interval between a call to xfs_mru_cache_lookup() and a call to + * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode + * when new elements are added to the cache. + * + * Combined, these locking rules ensure that no associations will ever exist in + * the cache that reference per-ag array elements that have since been + * reallocated. + */ +int +xfs_filestream_peek_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_read(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static int +xfs_filestream_get_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ret; + + pag = xfs_perag_get(mp, agno); + ret = atomic_inc_return(&pag->pagf_fstrms); + xfs_perag_put(pag); + return ret; +} + +static void +xfs_filestream_put_ag( + xfs_mount_t *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + atomic_dec(&pag->pagf_fstrms); + xfs_perag_put(pag); +} + +static void +xfs_fstrm_free_func( + struct xfs_mru_cache_elem *mru) +{ + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + + xfs_filestream_put_ag(item->ip->i_mount, item->ag); + + trace_xfs_filestream_free(item->ip, item->ag); + + kmem_free(item); +} + +/* + * Scan the AGs starting at startag looking for an AG that isn't in use and has + * at least minlen blocks free. + */ +static int +xfs_filestream_pick_ag( + struct xfs_inode *ip, + xfs_agnumber_t startag, + xfs_agnumber_t *agp, + int flags, + xfs_extlen_t minlen) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_fstrm_item *item; + struct xfs_perag *pag; + xfs_extlen_t longest, free = 0, minfree, maxfree = 0; + xfs_agnumber_t ag, max_ag = NULLAGNUMBER; + int err, trylock, nscan; + + ASSERT(S_ISDIR(ip->i_d.di_mode)); + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + ag = startag; + *agp = NULLAGNUMBER; + + /* For the first pass, don't sleep trying to init the per-AG. */ + trylock = XFS_ALLOC_FLAG_TRYLOCK; + + for (nscan = 0; 1; nscan++) { + trace_xfs_filestream_scan(ip, ag); + + pag = xfs_perag_get(mp, ag); + + if (!pag->pagf_init) { + err = xfs_alloc_pagf_init(mp, NULL, ag, trylock); + if (err && !trylock) { + xfs_perag_put(pag); + return err; + } + } + + /* Might fail sometimes during the 1st pass with trylock set. */ + if (!pag->pagf_init) + goto next_ag; + + /* Keep track of the AG with the most free blocks. */ + if (pag->pagf_freeblks > maxfree) { + maxfree = pag->pagf_freeblks; + max_ag = ag; + } + + /* + * The AG reference count does two things: it enforces mutual + * exclusion when examining the suitability of an AG in this + * loop, and it guards against two filestreams being established + * in the same AG as each other. + */ + if (xfs_filestream_get_ag(mp, ag) > 1) { + xfs_filestream_put_ag(mp, ag); + goto next_ag; + } + + longest = xfs_alloc_longest_free_extent(mp, pag); + if (((minlen && longest >= minlen) || + (!minlen && pag->pagf_freeblks >= minfree)) && + (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + + /* Break out, retaining the reference on the AG. */ + free = pag->pagf_freeblks; + xfs_perag_put(pag); + *agp = ag; + break; + } + + /* Drop the reference on this AG, it's not usable. */ + xfs_filestream_put_ag(mp, ag); +next_ag: + xfs_perag_put(pag); + /* Move to the next AG, wrapping to AG 0 if necessary. */ + if (++ag >= mp->m_sb.sb_agcount) + ag = 0; + + /* If a full pass of the AGs hasn't been done yet, continue. */ + if (ag != startag) + continue; + + /* Allow sleeping in xfs_alloc_pagf_init() on the 2nd pass. */ + if (trylock != 0) { + trylock = 0; + continue; + } + + /* Finally, if lowspace wasn't set, set it for the 3rd pass. */ + if (!(flags & XFS_PICK_LOWSPACE)) { + flags |= XFS_PICK_LOWSPACE; + continue; + } + + /* + * Take the AG with the most free space, regardless of whether + * it's already in use by another filestream. + */ + if (max_ag != NULLAGNUMBER) { + xfs_filestream_get_ag(mp, max_ag); + free = maxfree; + *agp = max_ag; + break; + } + + /* take AG 0 if none matched */ + trace_xfs_filestream_pick(ip, *agp, free, nscan); + *agp = 0; + return 0; + } + + trace_xfs_filestream_pick(ip, *agp, free, nscan); + + if (*agp == NULLAGNUMBER) + return 0; + + err = ENOMEM; + item = kmem_alloc(sizeof(*item), KM_MAYFAIL); + if (!item) + goto out_put_ag; + + item->ag = *agp; + item->ip = ip; + + err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru); + if (err) { + if (err == EEXIST) + err = 0; + goto out_free_item; + } + + return 0; + +out_free_item: + kmem_free(item); +out_put_ag: + xfs_filestream_put_ag(mp, *agp); + return err; +} + +static struct xfs_inode * +xfs_filestream_get_parent( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip), *dir = NULL; + struct dentry *dentry, *parent; + + dentry = d_find_alias(inode); + if (!dentry) + goto out; + + parent = dget_parent(dentry); + if (!parent) + goto out_dput; + + dir = igrab(parent->d_inode); + dput(parent); + +out_dput: + dput(dentry); +out: + return dir ? XFS_I(dir) : NULL; +} + +/* + * Find the right allocation group for a file, either by finding an + * existing file stream or creating a new one. + * + * Returns NULLAGNUMBER in case of an error. + */ +xfs_agnumber_t +xfs_filestream_lookup_ag( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_inode *pip = NULL; + xfs_agnumber_t startag, ag = NULLAGNUMBER; + struct xfs_mru_cache_elem *mru; + + ASSERT(S_ISREG(ip->i_d.di_mode)); + + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto out; + + mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); + if (mru) { + ag = container_of(mru, struct xfs_fstrm_item, mru)->ag; + xfs_mru_cache_done(mp->m_filestream); + + trace_xfs_filestream_lookup(ip, ag); + goto out; + } + + /* + * Set the starting AG using the rotor for inode32, otherwise + * use the directory inode's AG. + */ + if (mp->m_flags & XFS_MOUNT_32BITINODES) { + xfs_agnumber_t rotorstep = xfs_rotorstep; + startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; + mp->m_agfrotor = (mp->m_agfrotor + 1) % + (mp->m_sb.sb_agcount * rotorstep); + } else + startag = XFS_INO_TO_AGNO(mp, pip->i_ino); + + if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0)) + ag = NULLAGNUMBER; +out: + IRELE(pip); + return ag; +} + +/* + * Pick a new allocation group for the current file and its file stream. + * + * This is called when the allocator can't find a suitable extent in the + * current AG, and we have to move the stream into a new AG with more space. + */ +int +xfs_filestream_new_ag( + struct xfs_bmalloca *ap, + xfs_agnumber_t *agp) +{ + struct xfs_inode *ip = ap->ip, *pip; + struct xfs_mount *mp = ip->i_mount; + xfs_extlen_t minlen = ap->length; + xfs_agnumber_t startag = 0; + int flags, err = 0; + struct xfs_mru_cache_elem *mru; + + *agp = NULLAGNUMBER; + + pip = xfs_filestream_get_parent(ip); + if (!pip) + goto exit; + + mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino); + if (mru) { + struct xfs_fstrm_item *item = + container_of(mru, struct xfs_fstrm_item, mru); + startag = (item->ag + 1) % mp->m_sb.sb_agcount; + } + + flags = (ap->userdata ? XFS_PICK_USERDATA : 0) | + (ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0); + + err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen); + + /* + * Only free the item here so we skip over the old AG earlier. + */ + if (mru) + xfs_fstrm_free_func(mru); + + IRELE(pip); +exit: + if (*agp == NULLAGNUMBER) + *agp = 0; + return err; +} + +void +xfs_filestream_deassociate( + struct xfs_inode *ip) +{ + xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino); +} + +int +xfs_filestream_mount( + xfs_mount_t *mp) +{ + /* + * The filestream timer tunable is currently fixed within the range of + * one second to four minutes, with five seconds being the default. The + * group count is somewhat arbitrary, but it'd be nice to adhere to the + * timer tunable to within about 10 percent. This requires at least 10 + * groups. + */ + return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10, + 10, xfs_fstrm_free_func); +} + +void +xfs_filestream_unmount( + xfs_mount_t *mp) +{ + xfs_mru_cache_destroy(mp->m_filestream); +} diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h new file mode 100644 index 00000000000..2ef43406e53 --- /dev/null +++ b/fs/xfs/xfs_filestream.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FILESTREAM_H__ +#define __XFS_FILESTREAM_H__ + +struct xfs_mount; +struct xfs_inode; +struct xfs_bmalloca; + +int xfs_filestream_mount(struct xfs_mount *mp); +void xfs_filestream_unmount(struct xfs_mount *mp); +void xfs_filestream_deassociate(struct xfs_inode *ip); +xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip); +int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp); +int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno); + +static inline int +xfs_inode_is_filestream( + struct xfs_inode *ip) +{ + return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + (ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM); +} + +#endif /* __XFS_FILESTREAM_H__ */ diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h new file mode 100644 index 00000000000..34d85aca305 --- /dev/null +++ b/fs/xfs/xfs_format.h @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_FORMAT_H__ +#define __XFS_FORMAT_H__ + +/* + * XFS On Disk Format Definitions + * + * This header file defines all the on-disk format definitions for + * general XFS objects. Directory and attribute related objects are defined in + * xfs_da_format.h, which log and log item formats are defined in + * xfs_log_format.h. Everything else goes here. + */ + +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; +struct xfs_buf; +struct xfs_ifork; + +/* + * RealTime Device format definitions + */ + +/* Min and max rt extent sizes, specified in bytes */ +#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ +#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */ +#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */ + +#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) +#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) +#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) +#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) + +/* + * RT Summary and bit manipulation macros. + */ +#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) +#define XFS_SUMOFFSTOBLOCK(mp,s) \ + (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) +#define XFS_SUMPTR(mp,bp,so) \ + ((xfs_suminfo_t *)((bp)->b_addr + \ + (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) + +#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) +#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) +#define XFS_BITTOWORD(mp,bi) \ + ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) + +#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) +#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) + +#define XFS_RTLOBIT(w) xfs_lowbit32(w) +#define XFS_RTHIBIT(w) xfs_highbit32(w) + +#if XFS_BIG_BLKNOS +#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) +#else +#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) +#endif + +/* + * Dquot and dquot block format definitions + */ +#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ +#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ + +/* + * This is the main portion of the on-disk representation of quota + * information for a user. This is the q_core of the xfs_dquot_t that + * is kept in kernel memory. We pad this with some more expansion room + * to construct the on disk structure. + */ +typedef struct xfs_disk_dquot { + __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ + __u8 d_version; /* dquot version */ + __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ + __be32 d_id; /* user,project,group id */ + __be64 d_blk_hardlimit;/* absolute limit on disk blks */ + __be64 d_blk_softlimit;/* preferred limit on disk blks */ + __be64 d_ino_hardlimit;/* maximum # allocated inodes */ + __be64 d_ino_softlimit;/* preferred inode limit */ + __be64 d_bcount; /* disk blocks owned by the user */ + __be64 d_icount; /* inodes owned by the user */ + __be32 d_itimer; /* zero if within inode limits if not, + this is when we refuse service */ + __be32 d_btimer; /* similar to above; for disk blocks */ + __be16 d_iwarns; /* warnings issued wrt num inodes */ + __be16 d_bwarns; /* warnings issued wrt disk blocks */ + __be32 d_pad0; /* 64 bit align */ + __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ + __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ + __be64 d_rtbcount; /* realtime blocks owned */ + __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ + __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ + __be16 d_pad; +} xfs_disk_dquot_t; + +/* + * This is what goes on disk. This is separated from the xfs_disk_dquot because + * carrying the unnecessary padding would be a waste of memory. + */ +typedef struct xfs_dqblk { + xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ + char dd_fill[4]; /* filling for posterity */ + + /* + * These two are only present on filesystems with the CRC bits set. + */ + __be32 dd_crc; /* checksum */ + __be64 dd_lsn; /* last modification in log */ + uuid_t dd_uuid; /* location information */ +} xfs_dqblk_t; + +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + +/* + * Remote symlink format and access functions. + */ +#define XFS_SYMLINK_MAGIC 0x58534c4d /* XSLM */ + +struct xfs_dsymlink_hdr { + __be32 sl_magic; + __be32 sl_offset; + __be32 sl_bytes; + __be32 sl_crc; + uuid_t sl_uuid; + __be64 sl_owner; + __be64 sl_blkno; + __be64 sl_lsn; +}; + +#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc) + +/* + * The maximum pathlen is 1024 bytes. Since the minimum file system + * blocksize is 512 bytes, we can get a max of 3 extents back from + * bmapi when crc headers are taken into account. + */ +#define XFS_SYMLINK_MAPS 3 + +#define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ + ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + sizeof(struct xfs_dsymlink_hdr) : 0)) + + +/* + * Allocation Btree format definitions + * + * There are two on-disk btrees, one sorted by blockno and one sorted + * by blockcount and blockno. All blocks look the same to make the code + * simpler; if we have time later, we'll make the optimizations. + */ +#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */ +#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */ +#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */ +#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */ + +/* + * Data record/key structure + */ +typedef struct xfs_alloc_rec { + __be32 ar_startblock; /* starting block number */ + __be32 ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_t, xfs_alloc_key_t; + +typedef struct xfs_alloc_rec_incore { + xfs_agblock_t ar_startblock; /* starting block number */ + xfs_extlen_t ar_blockcount; /* count of free blocks */ +} xfs_alloc_rec_incore_t; + +/* btree pointer type */ +typedef __be32 xfs_alloc_ptr_t; + +/* + * Block numbers in the AG: + * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3. + */ +#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1)) +#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1)) + + +/* + * Inode Allocation Btree format definitions + * + * There is a btree for the inode map per allocation group. + */ +#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ +#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */ +#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */ +#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */ + +typedef __uint64_t xfs_inofree_t; +#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) +#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) +#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) +#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) + +static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) +{ + return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; +} + +/* + * Data record structure + */ +typedef struct xfs_inobt_rec { + __be32 ir_startino; /* starting inode number */ + __be32 ir_freecount; /* count of free inodes (set bits) */ + __be64 ir_free; /* free inode mask */ +} xfs_inobt_rec_t; + +typedef struct xfs_inobt_rec_incore { + xfs_agino_t ir_startino; /* starting inode number */ + __int32_t ir_freecount; /* count of free inodes (set bits) */ + xfs_inofree_t ir_free; /* free inode mask */ +} xfs_inobt_rec_incore_t; + + +/* + * Key structure + */ +typedef struct xfs_inobt_key { + __be32 ir_startino; /* starting inode number */ +} xfs_inobt_key_t; + +/* btree pointer type */ +typedef __be32 xfs_inobt_ptr_t; + +/* + * block numbers in the AG. + */ +#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) +#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) + +/* + * The first data block of an AG depends on whether the filesystem was formatted + * with the finobt feature. If so, account for the finobt reserved root btree + * block. + */ +#define XFS_PREALLOC_BLOCKS(mp) \ + (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + XFS_FIBT_BLOCK(mp) + 1 : \ + XFS_IBT_BLOCK(mp) + 1) + + + +/* + * BMAP Btree format definitions + * + * This includes both the root block definition that sits inside an inode fork + * and the record/pointer formats for the leaf/node in the blocks. + */ +#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */ +#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */ + +/* + * Bmap root header, on-disk form only. + */ +typedef struct xfs_bmdr_block { + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ +} xfs_bmdr_block_t; + +/* + * Bmap btree record and extent descriptor. + * l0:63 is an extent flag (value 1 indicates non-normal). + * l0:9-62 are startoff. + * l0:0-8 and l1:21-63 are startblock. + * l1:0-20 are blockcount. + */ +#define BMBT_EXNTFLAG_BITLEN 1 +#define BMBT_STARTOFF_BITLEN 54 +#define BMBT_STARTBLOCK_BITLEN 52 +#define BMBT_BLOCKCOUNT_BITLEN 21 + +typedef struct xfs_bmbt_rec { + __be64 l0, l1; +} xfs_bmbt_rec_t; + +typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */ +typedef xfs_bmbt_rec_t xfs_bmdr_rec_t; + +typedef struct xfs_bmbt_rec_host { + __uint64_t l0, l1; +} xfs_bmbt_rec_host_t; + +/* + * Values and macros for delayed-allocation startblock fields. + */ +#define STARTBLOCKVALBITS 17 +#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) +#define DSTARTBLOCKMASKBITS (15 + 20) +#define STARTBLOCKMASK \ + (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) +#define DSTARTBLOCKMASK \ + (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) + +static inline int isnullstartblock(xfs_fsblock_t x) +{ + return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; +} + +static inline int isnulldstartblock(xfs_dfsbno_t x) +{ + return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK; +} + +static inline xfs_fsblock_t nullstartblock(int k) +{ + ASSERT(k < (1 << STARTBLOCKVALBITS)); + return STARTBLOCKMASK | (k); +} + +static inline xfs_filblks_t startblockval(xfs_fsblock_t x) +{ + return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); +} + +/* + * Possible extent formats. + */ +typedef enum { + XFS_EXTFMT_NOSTATE = 0, + XFS_EXTFMT_HASSTATE +} xfs_exntfmt_t; + +/* + * Possible extent states. + */ +typedef enum { + XFS_EXT_NORM, XFS_EXT_UNWRITTEN, + XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID +} xfs_exntst_t; + +/* + * Incore version of above. + */ +typedef struct xfs_bmbt_irec +{ + xfs_fileoff_t br_startoff; /* starting file offset */ + xfs_fsblock_t br_startblock; /* starting block number */ + xfs_filblks_t br_blockcount; /* number of blocks */ + xfs_exntst_t br_state; /* extent state */ +} xfs_bmbt_irec_t; + +/* + * Key structure for non-leaf levels of the tree. + */ +typedef struct xfs_bmbt_key { + __be64 br_startoff; /* starting file offset */ +} xfs_bmbt_key_t, xfs_bmdr_key_t; + +/* btree pointer type */ +typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t; + + +/* + * Generic Btree block format definitions + * + * This is a combination of the actual format used on disk for short and long + * format btrees. The first three fields are shared by both format, but the + * pointers are different and should be used with care. + * + * To get the size of the actual short or long form headers please use the size + * macros below. Never use sizeof(xfs_btree_block). + * + * The blkno, crc, lsn, owner and uuid fields are only available in filesystems + * with the crc feature bit, and all accesses to them must be conditional on + * that flag. + */ +struct xfs_btree_block { + __be32 bb_magic; /* magic number for block type */ + __be16 bb_level; /* 0 is a leaf */ + __be16 bb_numrecs; /* current # of data records */ + union { + struct { + __be32 bb_leftsib; + __be32 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be32 bb_owner; + __le32 bb_crc; + } s; /* short form pointers */ + struct { + __be64 bb_leftsib; + __be64 bb_rightsib; + + __be64 bb_blkno; + __be64 bb_lsn; + uuid_t bb_uuid; + __be64 bb_owner; + __le32 bb_crc; + __be32 bb_pad; /* padding for alignment */ + } l; /* long form pointers */ + } bb_u; /* rest */ +}; + +#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ +#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ + +/* sizes of CRC enabled btree blocks */ +#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) +#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) + +#define XFS_BTREE_SBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.s.bb_crc) +#define XFS_BTREE_LBLOCK_CRC_OFF \ + offsetof(struct xfs_btree_block, bb_u.l.bb_crc) + +#endif /* __XFS_FORMAT_H__ */ diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 0f0ad153595..d34703dbcb4 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -22,8 +22,6 @@ * SGI's XFS filesystem's major stuff (constants, structures) */ -#define XFS_NAME "xfs" - /* * Direct I/O attribute record used with XFS_IOC_DIOINFO * d_miniosz is the min xfer size, xfer size multiple and file seek offset @@ -68,6 +66,7 @@ struct fsxattr { #define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ #define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ #define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ #define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ /* @@ -114,22 +113,16 @@ struct getbmapx { #define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */ #define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ -#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC) -#ifdef __KERNEL__ -#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */ -#endif +#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ +#define BMV_IF_VALID \ + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) -/* bmv_oflags values - returned for for each non-header segment */ +/* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ - -/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */ -#define GETBMAP_CONVERT(p1,p2) { \ - p2.bmv_offset = p1.bmv_offset; \ - p2.bmv_block = p1.bmv_block; \ - p2.bmv_length = p1.bmv_length; \ - p2.bmv_count = p1.bmv_count; \ - p2.bmv_entries = p1.bmv_entries; } - +#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */ +#define BMV_OF_LAST 0x4 /* segment is the last in the file */ /* * Structure for XFS_IOC_FSSETDM. @@ -240,16 +233,31 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ - +#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ +#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ +#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ +#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ +#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ /* - * Minimum and maximum sizes need for growth checks + * Minimum and maximum sizes need for growth checks. + * + * Block counts are in units of filesystem blocks, not basic blocks. */ #define XFS_MIN_AG_BLOCKS 64 -#define XFS_MIN_LOG_BLOCKS 512 -#define XFS_MAX_LOG_BLOCKS (64 * 1024) -#define XFS_MIN_LOG_BYTES (256 * 1024) -#define XFS_MAX_LOG_BYTES (128 * 1024 * 1024) +#define XFS_MIN_LOG_BLOCKS 512ULL +#define XFS_MAX_LOG_BLOCKS (1024 * 1024ULL) +#define XFS_MIN_LOG_BYTES (10 * 1024 * 1024ULL) + +/* keep the maximum size under 2^31 by a small amount */ +#define XFS_MAX_LOG_BYTES \ + ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) + +/* Used for sanity checks on superblock */ +#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks) +#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) * \ + (s)->sb_agblocks + XFS_MIN_AG_BLOCKS) /* * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT @@ -295,14 +303,28 @@ typedef struct xfs_bstat { __s32 bs_extsize; /* extent size */ __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ - __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ + __u16 bs_projid_hi; /* higher part of project id */ + unsigned char bs_pad[10]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ } xfs_bstat_t; /* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was choosen + * to retain compatibility with "old" filesystems). + */ +static inline __uint32_t +bstat_get_projid(struct xfs_bstat *bs) +{ + return (__uint32_t)bs->bs_projid_hi << 16 | bs->bs_projid_lo; +} + +/* * The user-level BulkStat Request interface structure. */ typedef struct xfs_fsop_bulkreq { @@ -333,6 +355,35 @@ typedef struct xfs_error_injection { /* + * Speculative preallocation trimming. + */ +#define XFS_EOFBLOCKS_VERSION 1 +struct xfs_fs_eofblocks { + __u32 eof_version; + __u32 eof_flags; + uid_t eof_uid; + gid_t eof_gid; + prid_t eof_prid; + __u32 pad32; + __u64 eof_min_file_size; + __u64 pad64[12]; +}; + +/* eof_flags values */ +#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ +#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ +#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ +#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ +#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ +#define XFS_EOF_FLAGS_VALID \ + (XFS_EOF_FLAGS_SYNC | \ + XFS_EOF_FLAGS_UID | \ + XFS_EOF_FLAGS_GID | \ + XFS_EOF_FLAGS_PRID | \ + XFS_EOF_FLAGS_MINFILESIZE) + + +/* * The user-level Handle Request interface structure. */ typedef struct xfs_fsop_handlereq { @@ -371,6 +422,9 @@ typedef struct xfs_fsop_attrlist_handlereq { typedef struct xfs_attr_multiop { __u32 am_opcode; +#define ATTR_OP_GET 1 /* return the indicated attr's value */ +#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */ +#define ATTR_OP_REMOVE 3 /* remove the indicated attr */ __s32 am_error; void __user *am_attrname; void __user *am_attrvalue; @@ -389,30 +443,13 @@ typedef struct xfs_fsop_attrmulti_handlereq { */ typedef struct { __u32 val[2]; } xfs_fsid_t; /* file system id type */ - -#ifndef HAVE_FID -#define MAXFIDSZ 46 - -typedef struct fid { - __u16 fid_len; /* length of data in bytes */ - unsigned char fid_data[MAXFIDSZ]; /* data (fid_len worth) */ -} fid_t; -#endif - typedef struct xfs_fid { - __u16 xfs_fid_len; /* length of remainder */ - __u16 xfs_fid_pad; - __u32 xfs_fid_gen; /* generation number */ - __u64 xfs_fid_ino; /* 64 bits inode number */ + __u16 fid_len; /* length of remainder */ + __u16 fid_pad; + __u32 fid_gen; /* generation number */ + __u64 fid_ino; /* 64 bits inode number */ } xfs_fid_t; -typedef struct xfs_fid2 { - __u16 fid_len; /* length of remainder */ - __u16 fid_pad; /* padding, must be zero */ - __u32 fid_gen; /* generation number */ - __u64 fid_ino; /* inode number */ -} xfs_fid2_t; - typedef struct xfs_handle { union { __s64 align; /* force alignment of ha_fid */ @@ -422,15 +459,26 @@ typedef struct xfs_handle { } xfs_handle_t; #define ha_fsid ha_u._ha_fsid -#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.xfs_fid_pad \ +#define XFS_HSIZE(handle) (((char *) &(handle).ha_fid.fid_pad \ - (char *) &(handle)) \ - + (handle).ha_fid.xfs_fid_len) + + (handle).ha_fid.fid_len) -#define XFS_HANDLE_CMP(h1, h2) memcmp(h1, h2, sizeof(xfs_handle_t)) - -#define FSHSIZE sizeof(fsid_t) +/* + * Structure passed to XFS_IOC_SWAPEXT + */ +typedef struct xfs_swapext +{ + __int64_t sx_version; /* version */ +#define XFS_SX_VERSION 0 + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} xfs_swapext_t; -/* +/* * Flags for going down operation */ #define XFS_FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -440,9 +488,9 @@ typedef struct xfs_handle { /* * ioctl commands that are used by Linux filesystems */ -#define XFS_IOC_GETXFLAGS _IOR('f', 1, long) -#define XFS_IOC_SETXFLAGS _IOW('f', 2, long) -#define XFS_IOC_GETVERSION _IOR('v', 1, long) +#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS +#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS +#define XFS_IOC_GETVERSION FS_IOC_GETVERSION /* * ioctl commands that replace IRIX fcntl()'s @@ -467,6 +515,8 @@ typedef struct xfs_handle { /* XFS_IOC_SETBIOSIZE ---- deprecated 46 */ /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) +#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) +#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) /* * ioctl commands that replace IRIX syssgi()'s @@ -490,8 +540,14 @@ typedef struct xfs_handle { #define XFS_IOC_ERROR_INJECTION _IOW ('X', 116, struct xfs_error_injection) #define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) /* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ + +/* XFS_IOC_FREEZE -- FIFREEZE 119 */ +/* XFS_IOC_THAW -- FITHAW 120 */ +#ifndef FIFREEZE #define XFS_IOC_FREEZE _IOWR('X', 119, int) #define XFS_IOC_THAW _IOWR('X', 120, int) +#endif + #define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) #define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) #define XFS_IOC_ATTRMULTI_BY_HANDLE _IOW ('X', 123, struct xfs_fsop_attrmulti_handlereq) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index c064e72ada9..d2295561570 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -17,33 +17,31 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_inum.h" -#include "xfs_log.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" #include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_fsops.h" #include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" -#include "xfs_rw.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" /* * File system operations @@ -55,6 +53,9 @@ xfs_fs_geometry( xfs_fsop_geom_t *geo, int new_version) { + + memset(geo, 0, sizeof(*geo)); + geo->blocksize = mp->m_sb.sb_blocksize; geo->rtextsize = mp->m_sb.sb_rextsize; geo->agblocks = mp->m_sb.sb_agblocks; @@ -75,57 +76,86 @@ xfs_fs_geometry( } if (new_version >= 3) { geo->version = XFS_FSOP_GEOM_VERSION; - geo->flags = - (XFS_SB_VERSION_HASATTR(&mp->m_sb) ? + geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | + XFS_FSOP_GEOM_FLAGS_DIRV2 | + (xfs_sb_version_hasattr(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR : 0) | - (XFS_SB_VERSION_HASNLINK(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_NLINK : 0) | - (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) ? + (xfs_sb_version_hasquota(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_QUOTA : 0) | - (XFS_SB_VERSION_HASALIGN(&mp->m_sb) ? + (xfs_sb_version_hasalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_IALIGN : 0) | - (XFS_SB_VERSION_HASDALIGN(&mp->m_sb) ? + (xfs_sb_version_hasdalign(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_DALIGN : 0) | - (XFS_SB_VERSION_HASSHARED(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_SHARED : 0) | - (XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb) ? + (xfs_sb_version_hasextflgbit(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) | - (XFS_SB_VERSION_HASDIRV2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) | - (XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? + (xfs_sb_version_hassector(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_SECTOR : 0) | - (XFS_SB_VERSION_HASATTR2(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); - geo->logsectsize = XFS_SB_VERSION_HASSECTOR(&mp->m_sb) ? + (xfs_sb_version_hasasciici(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) | + (xfs_sb_version_haslazysbcount(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | + (xfs_sb_version_hasattr2(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | + (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0) | + (xfs_sb_version_hasftype(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | + (xfs_sb_version_hasfinobt(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; - geo->dirblocksize = mp->m_dirblksize; + geo->dirblocksize = mp->m_dir_geo->blksize; } if (new_version >= 4) { geo->flags |= - (XFS_SB_VERSION_HASLOGV2(&mp->m_sb) ? + (xfs_sb_version_haslogv2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_LOGV2 : 0); geo->logsunit = mp->m_sb.sb_logsunit; } return 0; } +static struct xfs_buf * +xfs_growfs_get_hdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_data_t *in) /* growfs data input struct */ { xfs_agf_t *agf; + struct xfs_agfl *agfl; xfs_agi_t *agi; xfs_agnumber_t agno; xfs_extlen_t agsize; xfs_extlen_t tmpsize; xfs_alloc_rec_t *arec; - xfs_btree_sblock_t *block; xfs_buf_t *bp; int bucket; int dpct; - int error; + int error, saved_error = 0; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_mod; @@ -133,20 +163,25 @@ xfs_growfs_data_private( xfs_rfsblock_t nfree; xfs_agnumber_t oagcount; int pct; - xfs_sb_t *sbp; xfs_trans_t *tp; nb = in->newblocks; pct = in->imaxpct; if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) + return error; dpct = pct - mp->m_sb.sb_imax_pct; - error = xfs_read_buf(mp, mp->m_ddev_targp, - XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp); - if (error) + bp = xfs_buf_read_uncached(mp->m_ddev_targp, + XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, NULL); + if (!bp) + return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); return error; - ASSERT(bp); + } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ @@ -154,41 +189,51 @@ xfs_growfs_data_private( nagcount = new + (nb_mod != 0); if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { nagcount--; - nb = nagcount * mp->m_sb.sb_agblocks; + nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; if (nb < mp->m_sb.sb_dblocks) return XFS_ERROR(EINVAL); } new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; + + /* allocate the new per-ag structures */ if (nagcount > oagcount) { - down_write(&mp->m_peraglock); - mp->m_perag = kmem_realloc(mp->m_perag, - sizeof(xfs_perag_t) * nagcount, - sizeof(xfs_perag_t) * oagcount, - KM_SLEEP); - memset(&mp->m_perag[oagcount], 0, - (nagcount - oagcount) * sizeof(xfs_perag_t)); - mp->m_flags |= XFS_MOUNT_32BITINODES; - nagimax = xfs_initialize_perag(XFS_MTOVFS(mp), mp, nagcount); - up_write(&mp->m_peraglock); + error = xfs_initialize_perag(mp, nagcount, &nagimax); + if (error) + return error; } + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); - if ((error = xfs_trans_reserve(tp, XFS_GROWFS_SPACE_RES(mp), - XFS_GROWDATA_LOG_RES(mp), 0, 0, 0))) { + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0); + if (error) { xfs_trans_cancel(tp, 0); return error; } + /* + * Write new AG headers to disk. Non-transactional, but written + * synchronously so they are completed prior to the growfs transaction + * being logged. + */ nfree = 0; for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { + __be32 *agfl_bno; + /* - * AG freelist header block + * AG freespace header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agf_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + agf = XFS_BUF_TO_AGF(bp); - memset(agf, 0, mp->m_sb.sb_sectsize); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); agf->agf_seqno = cpu_to_be32(agno); @@ -209,18 +254,55 @@ xfs_growfs_data_private( tmpsize = agsize - XFS_PREALLOC_BLOCKS(mp); agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - error = xfs_bwrite(mp, bp); - if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_uuid); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * AG freelist header block + */ + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agfl_buf_ops); + if (!bp) { + error = ENOMEM; goto error0; } + + agfl = XFS_BUF_TO_AGFL(bp); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); + } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + /* * AG inode header block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), + XFS_FSS_TO_BB(mp, 1), 0, + &xfs_agi_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + agi = XFS_BUF_TO_AGI(bp); - memset(agi, 0, mp->m_sb.sb_sectsize); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); agi->agi_seqno = cpu_to_be32(agno); @@ -231,74 +313,131 @@ xfs_growfs_data_private( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - error = xfs_bwrite(mp, bp); - if (error) { + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * BNO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); - block = XFS_BUF_TO_SBLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, - block, 1, mp->m_alloc_mxr[0]); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); + + if (!bp) { + error = ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, + agno, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); - error = xfs_bwrite(mp, bp); - if (error) { + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * CNT btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); - block = XFS_BUF_TO_SBLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); - block->bb_level = 0; - block->bb_numrecs = cpu_to_be16(1); - block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - arec = XFS_BTREE_REC_ADDR(mp->m_sb.sb_blocksize, xfs_alloc, - block, 1, mp->m_alloc_mxr[0]); + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_allocbt_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, + agno, 0); + + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); arec->ar_blockcount = cpu_to_be32( agsize - be32_to_cpu(arec->ar_startblock)); nfree += be32_to_cpu(arec->ar_blockcount); - error = xfs_bwrite(mp, bp); - if (error) { + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) goto error0; - } + /* * INO btree root block */ - bp = xfs_buf_get(mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0); - block = XFS_BUF_TO_SBLOCK(bp); - memset(block, 0, mp->m_sb.sb_blocksize); - block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); - block->bb_level = 0; - block->bb_numrecs = 0; - block->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - block->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - error = xfs_bwrite(mp, bp); - if (error) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = ENOMEM; goto error0; } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0, + agno, XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, + agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + + /* + * FINO btree root block + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + bp = xfs_growfs_get_hdr_buf(mp, + XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), + BTOBB(mp->m_sb.sb_blocksize), 0, + &xfs_inobt_buf_ops); + if (!bp) { + error = ENOMEM; + goto error0; + } + + if (xfs_sb_version_hascrc(&mp->m_sb)) + xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC, + 0, 0, agno, + XFS_BTREE_CRC_BLOCKS); + else + xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0, + 0, agno, 0); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + goto error0; + } + } xfs_trans_agblocks_delta(tp, nfree); /* @@ -314,7 +453,7 @@ xfs_growfs_data_private( } ASSERT(bp); agi = XFS_BUF_TO_AGI(bp); - be32_add(&agi->agi_length, new); + be32_add_cpu(&agi->agi_length, new); ASSERT(nagcount == oagcount || be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); @@ -327,9 +466,11 @@ xfs_growfs_data_private( } ASSERT(bp); agf = XFS_BUF_TO_AGF(bp); - be32_add(&agf->agf_length, new); + be32_add_cpu(&agf->agf_length, new); ASSERT(be32_to_cpu(agf->agf_length) == be32_to_cpu(agi->agi_length)); + + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); /* * Free the new space. */ @@ -339,6 +480,12 @@ xfs_growfs_data_private( goto error0; } } + + /* + * Update changed superblock fields transactionally. These are not + * seen by the rest of the world until the transaction commit applies + * them atomically to the superblock. + */ if (nagcount > oagcount) xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount); if (nb > mp->m_sb.sb_dblocks) @@ -348,10 +495,10 @@ xfs_growfs_data_private( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); - error = xfs_trans_commit(tp, 0, NULL); - if (error) { + error = xfs_trans_commit(tp, 0); + if (error) return error; - } + /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; @@ -361,33 +508,59 @@ xfs_growfs_data_private( mp->m_maxicount = icount << mp->m_sb.sb_inopblog; } else mp->m_maxicount = 0; + xfs_set_low_space_thresholds(mp); + + /* update secondary superblocks. */ for (agno = 1; agno < nagcount; agno++) { - error = xfs_read_buf(mp, mp->m_ddev_targp, + error = 0; + /* + * new secondary superblocks need to be zeroed, not read from + * disk as the contents of the new area we are growing into is + * completely unknown. + */ + if (agno < oagcount) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); - if (error) { - xfs_fs_cmn_err(CE_WARN, mp, - "error %d reading secondary superblock for ag %d", - error, agno); - break; + XFS_FSS_TO_BB(mp, 1), 0, &bp, + &xfs_sb_buf_ops); + } else { + bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, + XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (bp) { + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + } else + error = ENOMEM; } - sbp = XFS_BUF_TO_SBP(bp); - xfs_xlatesb(sbp, &mp->m_sb, -1, XFS_SB_ALL_BITS); + /* - * If we get an error writing out the alternate superblocks, - * just issue a warning and continue. The real work is - * already done and committed. + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. */ - if (!(error = xfs_bwrite(mp, bp))) { + if (error) { + xfs_warn(mp, + "error %d reading secondary superblock for ag %d", + error, agno); + saved_error = error; continue; - } else { - xfs_fs_cmn_err(CE_WARN, mp, + } + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) { + xfs_warn(mp, "write error %d updating secondary superblock for ag %d", error, agno); - break; /* no point in continuing */ + saved_error = error; + continue; } } - return 0; + return saved_error ? saved_error : error; error0: xfs_trans_cancel(tp, XFS_TRANS_ABORT); @@ -429,10 +602,13 @@ xfs_growfs_data( xfs_growfs_data_t *in) { int error; - if (!cpsema(&mp->m_growlock)) + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (!mutex_trylock(&mp->m_growlock)) return XFS_ERROR(EWOULDBLOCK); error = xfs_growfs_data_private(mp, in); - vsema(&mp->m_growlock); + mutex_unlock(&mp->m_growlock); return error; } @@ -442,10 +618,13 @@ xfs_growfs_log( xfs_growfs_log_t *in) { int error; - if (!cpsema(&mp->m_growlock)) + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + if (!mutex_trylock(&mp->m_growlock)) return XFS_ERROR(EWOULDBLOCK); error = xfs_growfs_log_private(mp, in); - vsema(&mp->m_growlock); + mutex_unlock(&mp->m_growlock); return error; } @@ -458,15 +637,13 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - unsigned long s; - - xfs_icsb_sync_counters_lazy(mp); - s = XFS_SB_LOCK(mp); + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + spin_lock(&mp->m_sb_lock); cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; cnt->freeino = mp->m_sb.sb_ifree; cnt->allocino = mp->m_sb.sb_icount; - XFS_SB_UNLOCK(mp, s); + spin_unlock(&mp->m_sb_lock); return 0; } @@ -491,30 +668,48 @@ xfs_reserve_blocks( __uint64_t *inval, xfs_fsop_resblks_t *outval) { - __int64_t lcounter, delta; + __int64_t lcounter, delta, fdblks_delta; __uint64_t request; - unsigned long s; /* If inval is null, report current values and return */ - if (inval == (__uint64_t *)NULL) { + if (!outval) + return EINVAL; outval->resblks = mp->m_resblks; outval->resblks_avail = mp->m_resblks_avail; return 0; } request = *inval; - s = XFS_SB_LOCK(mp); + + /* + * With per-cpu counters, this becomes an interesting + * problem. we needto work out if we are freeing or allocation + * blocks first, then we can do the modification as necessary. + * + * We do this under the m_sb_lock so that if we are near + * ENOSPC, we will hold out any changes while we work out + * what to do. This means that the amount of free space can + * change while we do this, so we need to retry if we end up + * trying to reserve more space than is available. + * + * We also use the xfs_mod_incore_sb() interface so that we + * don't have to care about whether per cpu counter are + * enabled, disabled or even compiled in.... + */ +retry: + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, * then move any unused blocks back to the free pool. */ - + fdblks_delta = 0; if (mp->m_resblks > request) { lcounter = mp->m_resblks_avail - request; if (lcounter > 0) { /* release unused blocks */ - mp->m_sb.sb_fdblocks += lcounter; + fdblks_delta = lcounter; mp->m_resblks_avail -= lcounter; } mp->m_resblks = request; @@ -522,50 +717,81 @@ xfs_reserve_blocks( __int64_t free; free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + if (!free) + goto out; /* ENOSPC and fdblks_delta = 0 */ + delta = request - mp->m_resblks; lcounter = free - delta; if (lcounter < 0) { /* We can't satisfy the request, just get what we can */ mp->m_resblks += free; mp->m_resblks_avail += free; - mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp); + fdblks_delta = -free; } else { - mp->m_sb.sb_fdblocks = - lcounter + XFS_ALLOC_SET_ASIDE(mp); + fdblks_delta = -delta; mp->m_resblks = request; mp->m_resblks_avail += delta; } } +out: + if (outval) { + outval->resblks = mp->m_resblks; + outval->resblks_avail = mp->m_resblks_avail; + } + spin_unlock(&mp->m_sb_lock); - outval->resblks = mp->m_resblks; - outval->resblks_avail = mp->m_resblks_avail; - XFS_SB_UNLOCK(mp, s); + if (fdblks_delta) { + /* + * If we are putting blocks back here, m_resblks_avail is + * already at its max so this will put it in the free pool. + * + * If we need space, we'll either succeed in getting it + * from the free block count or we'll get an enospc. If + * we get a ENOSPC, it means things changed while we were + * calculating fdblks_delta and so we should try again to + * see if there is anything left to reserve. + * + * Don't set the reserved flag here - we don't want to reserve + * the extra reserve blocks from the reserve..... + */ + int error; + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + fdblks_delta, 0); + if (error == ENOSPC) + goto retry; + } return 0; } -void +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead and use a + * synchronous transaction to ensure the superblock is immediately unpinned + * and can be written back. + */ +int xfs_fs_log_dummy( xfs_mount_t *mp) { xfs_trans_t *tp; - xfs_inode_t *ip; + int error; - tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if (xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0)) { + tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* log the UUID because it is an unchanging field */ + xfs_mod_sb(tp, XFS_SB_UUID); xfs_trans_set_sync(tp); - xfs_trans_commit(tp, 0, NULL); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_trans_commit(tp, 0); } int @@ -575,14 +801,13 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); - struct super_block *sb = freeze_bdev(vfsp->vfs_super->s_bdev); + struct super_block *sb = freeze_bdev(mp->m_super->s_bdev); if (sb && !IS_ERR(sb)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); thaw_bdev(sb->s_bdev, sb); } - + break; } case XFS_FSOP_GOING_FLAGS_LOGFLUSH: @@ -598,3 +823,63 @@ xfs_fs_goingdown( return 0; } + +/* + * Force a shutdown of the filesystem instantly while keeping the filesystem + * consistent. We don't do an unmount here; just shutdown the shop, make sure + * that absolutely nothing persistent happens to this filesystem after this + * point. + */ +void +xfs_do_force_shutdown( + xfs_mount_t *mp, + int flags, + char *fname, + int lnnum) +{ + int logerror; + + logerror = flags & SHUTDOWN_LOG_IO_ERROR; + + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_notice(mp, + "%s(0x%x) called from line %d of file %s. Return address = 0x%p", + __func__, flags, lnnum, fname, __return_address); + } + /* + * No need to duplicate efforts. + */ + if (XFS_FORCED_SHUTDOWN(mp) && !logerror) + return; + + /* + * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't + * queue up anybody new on the log reservations, and wakes up + * everybody who's sleeping on log reservations to tell them + * the bad news. + */ + if (xfs_log_force_umount(mp, logerror)) + return; + + if (flags & SHUTDOWN_CORRUPT_INCORE) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, + "Corruption of in-memory data detected. Shutting down filesystem"); + if (XFS_ERRLEVEL_HIGH <= xfs_error_level) + xfs_stack_trace(); + } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + if (logerror) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, + "Log I/O Error Detected. Shutting down filesystem"); + } else if (flags & SHUTDOWN_DEVICE_REQ) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "All device paths lost. Shutting down filesystem"); + } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { + xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, + "I/O Error Detected. Shutting down filesystem"); + } + } + if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { + xfs_alert(mp, + "Please umount the filesystem and rectify the problem(s)"); + } +} diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 300d0c9d61a..1b6a98b6688 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern void xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(struct xfs_mount *mp); #endif /* __XFS_FSOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/xfs_globals.c index 6c162c3dde7..5399ef222dd 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -16,25 +16,19 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_cred.h" #include "xfs_sysctl.h" /* - * System memory size - used to scale certain data structures in XFS. - */ -unsigned long xfs_physmem; - -/* * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, * other XFS code uses these values. Times are measured in centisecs (i.e. - * 100ths of a second). + * 100ths of a second) with the exception of eofb_timer, which is measured in + * seconds. */ xfs_param_t xfs_params = { /* MIN DFLT MAX */ - .restrict_chown = { 0, 1, 1 }, .sgid_inherit = { 0, 0, 1 }, .symlink_mode = { 0, 0, 1 }, - .panic_mask = { 0, 0, 127 }, + .panic_mask = { 0, 0, 255 }, .error_level = { 0, 3, 11 }, .syncd_timer = { 1*100, 30*100, 7200*100}, .stats_clear = { 0, 0, 1 }, @@ -46,10 +40,6 @@ xfs_param_t xfs_params = { .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, + .fstrm_timer = { 1, 30*100, 3600*100}, + .eofb_timer = { 1, 300, 3600*24}, }; - -/* - * Global system credential structure. - */ -cred_t sys_cred_val, *sys_cred = &sys_cred_val; - diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 33164a85aa9..5960e5593fe 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -17,96 +17,337 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_error.h" #include "xfs_bmap.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_icreate_item.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_trace.h" + /* - * Log specified fields for the inode given by bp and off. + * Allocation group level functions. */ -STATIC void -xfs_ialloc_log_di( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* inode buffer */ - int off, /* index of inode in buffer */ - int fields) /* bitmask of fields to log */ +static inline int +xfs_ialloc_cluster_alignment( + xfs_alloc_arg_t *args) { - int first; /* first byte number */ - int ioffset; /* off in bytes */ - int last; /* last byte number */ - xfs_mount_t *mp; /* mount point structure */ - static const short offsets[] = { /* field offsets */ - /* keep in sync with bits */ - offsetof(xfs_dinode_core_t, di_magic), - offsetof(xfs_dinode_core_t, di_mode), - offsetof(xfs_dinode_core_t, di_version), - offsetof(xfs_dinode_core_t, di_format), - offsetof(xfs_dinode_core_t, di_onlink), - offsetof(xfs_dinode_core_t, di_uid), - offsetof(xfs_dinode_core_t, di_gid), - offsetof(xfs_dinode_core_t, di_nlink), - offsetof(xfs_dinode_core_t, di_projid), - offsetof(xfs_dinode_core_t, di_pad), - offsetof(xfs_dinode_core_t, di_atime), - offsetof(xfs_dinode_core_t, di_mtime), - offsetof(xfs_dinode_core_t, di_ctime), - offsetof(xfs_dinode_core_t, di_size), - offsetof(xfs_dinode_core_t, di_nblocks), - offsetof(xfs_dinode_core_t, di_extsize), - offsetof(xfs_dinode_core_t, di_nextents), - offsetof(xfs_dinode_core_t, di_anextents), - offsetof(xfs_dinode_core_t, di_forkoff), - offsetof(xfs_dinode_core_t, di_aformat), - offsetof(xfs_dinode_core_t, di_dmevmask), - offsetof(xfs_dinode_core_t, di_dmstate), - offsetof(xfs_dinode_core_t, di_flags), - offsetof(xfs_dinode_core_t, di_gen), - offsetof(xfs_dinode_t, di_next_unlinked), - offsetof(xfs_dinode_t, di_u), - offsetof(xfs_dinode_t, di_a), - sizeof(xfs_dinode_t) - }; + if (xfs_sb_version_hasalign(&args->mp->m_sb) && + args->mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size)) + return args->mp->m_sb.sb_inoalignmt; + return 1; +} +/* + * Lookup a record by ino in the btree given by cur. + */ +int /* error */ +xfs_inobt_lookup( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_agino_t ino, /* starting inode of chunk */ + xfs_lookup_t dir, /* <=, >=, == */ + int *stat) /* success/failure */ +{ + cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_freecount = 0; + cur->bc_rec.i.ir_free = 0; + return xfs_btree_lookup(cur, dir, stat); +} + +/* + * Update the record referred to by cur to the value given. + * This either works (return 0) or gets an EFSCORRUPTED error. + */ +STATIC int /* error */ +xfs_inobt_update( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec) /* btree record */ +{ + union xfs_btree_rec rec; + + rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); + rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + rec.inobt.ir_free = cpu_to_be64(irec->ir_free); + return xfs_btree_update(cur, &rec); +} + +/* + * Get the data from the pointed-to record. + */ +int /* error */ +xfs_inobt_get_rec( + struct xfs_btree_cur *cur, /* btree cursor */ + xfs_inobt_rec_incore_t *irec, /* btree record */ + int *stat) /* output: success/failure */ +{ + union xfs_btree_rec *rec; + int error; + + error = xfs_btree_get_rec(cur, &rec, stat); + if (!error && *stat == 1) { + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + } + return error; +} + +/* + * Insert a single inobt record. Cursor must already point to desired location. + */ +STATIC int +xfs_inobt_insert_rec( + struct xfs_btree_cur *cur, + __int32_t freecount, + xfs_inofree_t free, + int *stat) +{ + cur->bc_rec.i.ir_freecount = freecount; + cur->bc_rec.i.ir_free = free; + return xfs_btree_insert(cur, stat); +} + +/* + * Insert records describing a newly allocated inode chunk into the inobt. + */ +STATIC int +xfs_inobt_insert( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t newino, + xfs_agino_t newlen, + xfs_btnum_t btnum) +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agino_t thisino; + int i; + int error; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + for (thisino = newino; + thisino < newino + newlen; + thisino += XFS_INODES_PER_CHUNK) { + error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 0); + + error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + XFS_INOBT_ALL_FREE, &i); + if (error) { + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; + } + ASSERT(i == 1); + } + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + return 0; +} + +/* + * Verify that the number of free inodes in the AGI is correct. + */ +#ifdef DEBUG +STATIC int +xfs_check_agi_freecount( + struct xfs_btree_cur *cur, + struct xfs_agi *agi) +{ + if (cur->bc_nlevels == 1) { + xfs_inobt_rec_incore_t rec; + int freecount = 0; + int error; + int i; + + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + + do { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + + if (i) { + freecount += rec.ir_freecount; + error = xfs_btree_increment(cur, 0, &i); + if (error) + return error; + } + } while (i == 1); + + if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); + } + return 0; +} +#else +#define xfs_check_agi_freecount(cur, agi) 0 +#endif + +/* + * Initialise a new set of inodes. When called without a transaction context + * (e.g. from recovery) we initiate a delayed write of the inode buffers rather + * than logging them (which in a transaction context puts them into the AIL + * for writeback rather than the xfsbufd queue). + */ +int +xfs_ialloc_inode_init( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_agblock_t length, + unsigned int gen) +{ + struct xfs_buf *fbuf; + struct xfs_dinode *free; + int nbufs, blks_per_cluster, inodes_per_cluster; + int version; + int i, j; + xfs_daddr_t d; + xfs_ino_t ino = 0; - ASSERT(offsetof(xfs_dinode_t, di_core) == 0); - ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0); - mp = tp->t_mountp; /* - * Get the inode-relative first and last bytes for these fields + * Loop over the new block(s), filling in the inodes. For small block + * sizes, manipulate the inodes in buffers which are multiples of the + * blocks size. */ - xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = length / blks_per_cluster; + /* - * Convert to buffer offsets and log it. + * Figure out what version number to use in the inodes we create. If + * the superblock version has caught up to the one that supports the new + * inode format, then use the new inode version. Otherwise use the old + * version so that old kernels will continue to be able to use the file + * system. + * + * For v3 inodes, we also need to write the inode number into the inode, + * so calculate the first inode number of the chunk here as + * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not + * across multiple filesystem blocks (such as a cluster) and so cannot + * be used in the cluster buffer loop below. + * + * Further, because we are writing the inode directly into the buffer + * and calculating a CRC on the entire inode, we have ot log the entire + * inode so that the entire range the CRC covers is present in the log. + * That means for v3 inode we log the entire buffer rather than just the + * inode cores. */ - ioffset = off << mp->m_sb.sb_inodelog; - first += ioffset; - last += ioffset; - xfs_trans_log_buf(tp, bp, first, last); -} + if (xfs_sb_version_hascrc(&mp->m_sb)) { + version = 3; + ino = XFS_AGINO_TO_INO(mp, agno, + XFS_OFFBNO_TO_AGINO(mp, agbno, 0)); -/* - * Allocation group level functions. - */ + /* + * log the initialisation that is about to take place as an + * logical operation. This means the transaction does not + * need to log the physical changes to the inode buffers as log + * recovery will know what initialisation is actually needed. + * Hence we only need to log the buffers as "ordered" buffers so + * they track in the AIL as if they were physically logged. + */ + if (tp) + xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + mp->m_sb.sb_inodesize, length, gen); + } else + version = 2; + + for (j = 0; j < nbufs; j++) { + /* + * Get the block. + */ + d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); + fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); + if (!fbuf) + return ENOMEM; + + /* Initialize the inode buffers and log them appropriately. */ + fbuf->b_ops = &xfs_inode_buf_ops; + xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); + for (i = 0; i < inodes_per_cluster; i++) { + int ioffset = i << mp->m_sb.sb_inodelog; + uint isize = xfs_dinode_size(version); + + free = xfs_make_iptr(mp, fbuf, i); + free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); + free->di_version = version; + free->di_gen = cpu_to_be32(gen); + free->di_next_unlinked = cpu_to_be32(NULLAGINO); + + if (version == 3) { + free->di_ino = cpu_to_be64(ino); + ino++; + uuid_copy(&free->di_uuid, &mp->m_sb.sb_uuid); + xfs_dinode_calc_crc(mp, free); + } else if (tp) { + /* just log the inode core */ + xfs_trans_log_buf(tp, fbuf, ioffset, + ioffset + isize - 1); + } + } + + if (tp) { + /* + * Mark the buffer as an inode allocation buffer so it + * sticks in AIL at the point of this allocation + * transaction. This ensures the they are on disk before + * the tail of the log can be moved past this + * transaction (i.e. by preventing relogging from moving + * it forward in the log). + */ + xfs_trans_inode_alloc_buf(tp, fbuf); + if (version == 3) { + /* + * Mark the buffer as ordered so that they are + * not physically logged in the transaction but + * still tracked in the AIL as part of the + * transaction and pin the log appropriately. + */ + xfs_trans_ordered_buf(tp, fbuf); + xfs_trans_log_buf(tp, fbuf, 0, + BBTOB(fbuf->b_length) - 1); + } + } else { + fbuf->b_flags |= XBF_DONE; + xfs_buf_delwri_queue(fbuf, buffer_list); + xfs_buf_relse(fbuf); + } + } + return 0; +} /* * Allocate new inodes in the allocation group specified by agbp. @@ -120,23 +361,15 @@ xfs_ialloc_ag_alloc( { xfs_agi_t *agi; /* allocation group header */ xfs_alloc_arg_t args; /* allocation argument structure */ - int blks_per_cluster; /* fs blocks per inode cluster */ - xfs_btree_cur_t *cur; /* inode btree cursor */ - xfs_daddr_t d; /* disk addr of buffer */ + xfs_agnumber_t agno; int error; - xfs_buf_t *fbuf; /* new free inodes' buffer */ - xfs_dinode_t *free; /* new free inode structure */ - int i; /* inode counter */ - int j; /* block counter */ - int nbufs; /* num bufs of new inodes */ xfs_agino_t newino; /* new first inode's number */ xfs_agino_t newlen; /* new number of inodes */ - int ninodes; /* num inodes per buf */ - xfs_agino_t thisino; /* current inode number, for loop */ - int version; /* inode version number to use */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + struct xfs_perag *pag; + memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; @@ -144,35 +377,59 @@ xfs_ialloc_ag_alloc( * Locking will ensure that we don't have two callers in here * at one time. */ - newlen = XFS_IALLOC_INODES(args.mp); + newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) return XFS_ERROR(ENOSPC); - args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); + args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* * First try to allocate inodes contiguous with the last-allocated * chunk of inodes. If the filesystem is striped, this will fill * an entire stripe unit with inodes. - */ + */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); + agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.mp->m_ialloc_blks; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; - args.alignment = 1; + /* - * Allow space for the inode btree to split. + * We need to take into account alignment here to ensure that + * we don't modify the free list if we fail to have an exact + * block. If we don't have an exact match, and every oher + * attempt allocation attempt fails, we'll end up cancelling + * a dirty transaction and shutting down. + * + * For an exact allocation, alignment must be 1, + * however we need to take cluster alignment into account when + * fixing up the freelist. Use the minalignslop field to + * indicate that extra blocks might be required for alignment, + * but not to use them in the actual exact allocation. */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + args.alignment = 1; + args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1; + + /* Allow space for the inode btree to split. */ + args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; + + /* + * This request might have dirtied the transaction if the AG can + * satisfy the request, but the exact block was not available. + * If the allocation did fail, subsequent requests will relax + * the exact agbno requirement and increase the alignment + * instead. It is critical that the total size of the request + * (len + alignment + slop) does not increase from this point + * on, so reset minalignslop to ensure it is not included in + * subsequent requests. + */ + args.minalignslop = 0; } else args.fsbno = NULLFSBLOCK; @@ -190,32 +447,24 @@ xfs_ialloc_ag_alloc( ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); args.alignment = args.mp->m_dalign; isaligned = 1; - } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, - XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + } else + args.alignment = xfs_ialloc_cluster_alignment(&args); /* * Need to figure out where to allocate the inode blocks. * Ideally they should be spaced out through the a.g. * For now, just allocate blocks up front. */ args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); /* * Allocate a fixed-size extent of inodes. */ args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.total = args.wasdel = args.isfl = - args.userdata = args.minalignslop = 0; args.prod = 1; /* * Allow space for the inode btree to split. */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + args.minleft = args.mp->m_in_maxlevels - 1; if ((error = xfs_alloc_vextent(&args))) return error; } @@ -227,14 +476,8 @@ xfs_ialloc_ag_alloc( if (isaligned && args.fsbno == NULLFSBLOCK) { args.type = XFS_ALLOCTYPE_NEAR_BNO; args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, - be32_to_cpu(agi->agi_seqno), args.agbno); - if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = xfs_ialloc_cluster_alignment(&args); if ((error = xfs_alloc_vextent(&args))) return error; } @@ -244,90 +487,46 @@ xfs_ialloc_ag_alloc( return 0; } ASSERT(args.len == args.minlen); + /* - * Convert the results. - */ - newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); - /* - * Loop over the new block(s), filling in the inodes. - * For small block sizes, manipulate the inodes in buffers - * which are multiples of the blocks size. + * Stamp and write the inode buffers. + * + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. */ - if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) { - blks_per_cluster = 1; - nbufs = (int)args.len; - ninodes = args.mp->m_sb.sb_inopblock; - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) / - args.mp->m_sb.sb_blocksize; - nbufs = (int)args.len / blks_per_cluster; - ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock; - } + error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, + args.len, prandom_u32()); + + if (error) + return error; /* - * Figure out what version number to use in the inodes we create. - * If the superblock version has caught up to the one that supports - * the new inode format, then use the new inode version. Otherwise - * use the old version so that old kernels will continue to be - * able to use the file system. + * Convert the results. */ - if (XFS_SB_VERSION_HASNLINK(&args.mp->m_sb)) - version = XFS_DINODE_VERSION_2; - else - version = XFS_DINODE_VERSION_1; - - for (j = 0; j < nbufs; j++) { - /* - * Get the block. - */ - d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno), - args.agbno + (j * blks_per_cluster)); - fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d, - args.mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); - ASSERT(fbuf); - ASSERT(!XFS_BUF_GETERROR(fbuf)); - /* - * Set initial values for the inodes in this buffer. - */ - xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); - for (i = 0; i < ninodes; i++) { - free = XFS_MAKE_IPTR(args.mp, fbuf, i); - INT_SET(free->di_core.di_magic, ARCH_CONVERT, XFS_DINODE_MAGIC); - INT_SET(free->di_core.di_version, ARCH_CONVERT, version); - INT_SET(free->di_next_unlinked, ARCH_CONVERT, NULLAGINO); - xfs_ialloc_log_di(tp, fbuf, i, - XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED); - } - xfs_trans_inode_alloc_buf(tp, fbuf); - } - be32_add(&agi->agi_count, newlen); - be32_add(&agi->agi_freecount, newlen); - down_read(&args.mp->m_peraglock); - args.mp->m_perag[be32_to_cpu(agi->agi_seqno)].pagi_freecount += newlen; - up_read(&args.mp->m_peraglock); + newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + be32_add_cpu(&agi->agi_count, newlen); + be32_add_cpu(&agi->agi_freecount, newlen); + pag = xfs_perag_get(args.mp, agno); + pag->pagi_freecount += newlen; + xfs_perag_put(pag); agi->agi_newino = cpu_to_be32(newino); + /* - * Insert records describing the new inode chunk into the btree. + * Insert records describing the new inode chunk into the btrees. */ - cur = xfs_btree_init_cursor(args.mp, tp, agbp, - be32_to_cpu(agi->agi_seqno), - XFS_BTNUM_INO, (xfs_inode_t *)0, 0); - for (thisino = newino; - thisino < newino + newlen; - thisino += XFS_INODES_PER_CHUNK) { - if ((error = xfs_inobt_lookup_eq(cur, thisino, - XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; - } - ASSERT(i == 0); - if ((error = xfs_inobt_insert(cur, &i))) { - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_FINO); + if (error) return error; - } - ASSERT(i == 1); } - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); /* * Log allocation group header fields */ @@ -342,7 +541,7 @@ xfs_ialloc_ag_alloc( return 0; } -STATIC __inline xfs_agnumber_t +STATIC xfs_agnumber_t xfs_ialloc_next_ag( xfs_mount_t *mp) { @@ -350,7 +549,7 @@ xfs_ialloc_next_ag( spin_lock(&mp->m_agirotor_lock); agno = mp->m_agirotor; - if (++mp->m_agirotor == mp->m_maxagi) + if (++mp->m_agirotor >= mp->m_maxagi) mp->m_agirotor = 0; spin_unlock(&mp->m_agirotor_lock); @@ -359,16 +558,15 @@ xfs_ialloc_next_ag( /* * Select an allocation group to look for a free inode in, based on the parent - * inode and then mode. Return the allocation group buffer. + * inode and the mode. Return the allocation group buffer. */ -STATIC xfs_buf_t * /* allocation group buffer */ +STATIC xfs_agnumber_t xfs_ialloc_ag_select( xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t parent, /* parent directory inode number */ - mode_t mode, /* bits set to indicate file type */ + umode_t mode, /* bits set to indicate file type */ int okalloc) /* ok to allocate more space */ { - xfs_buf_t *agbp; /* allocation group header buffer */ xfs_agnumber_t agcount; /* number of ag's in the filesystem */ xfs_agnumber_t agno; /* current ag number */ int flags; /* alloc buffer locking flags */ @@ -378,6 +576,7 @@ xfs_ialloc_ag_select( int needspace; /* file mode implies space allocated */ xfs_perag_t *pag; /* per allocation group data */ xfs_agnumber_t pagno; /* parent (starting) ag number */ + int error; /* * Files of these types need at least one block if length > 0 @@ -393,7 +592,9 @@ xfs_ialloc_ag_select( if (pagno >= agcount) pagno = 0; } + ASSERT(pagno < agcount); + /* * Loop through allocation groups, looking for one with a little * free space in it. Note we don't look for free inodes, exactly. @@ -403,637 +604,876 @@ xfs_ialloc_ag_select( */ agno = pagno; flags = XFS_ALLOC_FLAG_TRYLOCK; - down_read(&mp->m_peraglock); for (;;) { - pag = &mp->m_perag[agno]; + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + if (!pag->pagi_init) { - if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) goto nextag; - } - } else - agbp = NULL; + } - if (!pag->pagi_inodeok) { - xfs_ialloc_next_ag(mp); - goto unlock_nextag; + if (pag->pagi_freecount) { + xfs_perag_put(pag); + return agno; } - /* - * Is there enough free space for the file plus a block - * of inodes (if we need to allocate some)? - */ - ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp); - if (ineed && !pag->pagf_init) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; + if (!okalloc) + goto nextag; + + if (!pag->pagf_init) { + error = xfs_alloc_pagf_init(mp, tp, agno, flags); + if (error) goto nextag; - } - (void)xfs_alloc_pagf_init(mp, tp, agno, flags); } - if (!ineed || pag->pagf_init) { - if (ineed && !(longest = pag->pagf_longest)) - longest = pag->pagf_flcount > 0; - if (!ineed || - (pag->pagf_freeblks >= needspace + ineed && - longest >= ineed && - okalloc)) { - if (agbp == NULL && - xfs_ialloc_read_agi(mp, tp, agno, &agbp)) { - agbp = NULL; - goto nextag; - } - up_read(&mp->m_peraglock); - return agbp; - } + + /* + * Is there enough free space for the file plus a block of + * inodes? (if we need to allocate some)? + */ + ineed = mp->m_ialloc_blks; + longest = pag->pagf_longest; + if (!longest) + longest = pag->pagf_flcount > 0; + + if (pag->pagf_freeblks >= needspace + ineed && + longest >= ineed) { + xfs_perag_put(pag); + return agno; } -unlock_nextag: - if (agbp) - xfs_trans_brelse(tp, agbp); nextag: + xfs_perag_put(pag); /* * No point in iterating over the rest, if we're shutting * down. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - up_read(&mp->m_peraglock); - return (xfs_buf_t *)0; - } + if (XFS_FORCED_SHUTDOWN(mp)) + return NULLAGNUMBER; agno++; if (agno >= agcount) agno = 0; if (agno == pagno) { - if (flags == 0) { - up_read(&mp->m_peraglock); - return (xfs_buf_t *)0; - } + if (flags == 0) + return NULLAGNUMBER; flags = 0; } } } /* - * Visible inode allocation functions. + * Try to retrieve the next record to the left/right from the current one. */ - -/* - * Allocate an inode on disk. - * Mode is used to tell whether the new inode will need space, and whether - * it is a directory. - * - * The arguments IO_agbp and alloc_done are defined to work within - * the constraint of one allocation per transaction. - * xfs_dialloc() is designed to be called twice if it has to do an - * allocation to make more free inodes. On the first call, - * IO_agbp should be set to NULL. If an inode is available, - * i.e., xfs_dialloc() did not need to do an allocation, an inode - * number is returned. In this case, IO_agbp would be set to the - * current ag_buf and alloc_done set to false. - * If an allocation needed to be done, xfs_dialloc would return - * the current ag_buf in IO_agbp and set alloc_done to true. - * The caller should then commit the current transaction, allocate a new - * transaction, and call xfs_dialloc() again, passing in the previous - * value of IO_agbp. IO_agbp should be held across the transactions. - * Since the agbp is locked across the two calls, the second call is - * guaranteed to have a free inode available. - * - * Once we successfully pick an inode its number is returned and the - * on-disk data structures are updated. The inode itself is not read - * in, since doing so would break ordering constraints with xfs_reclaim. - */ -int -xfs_dialloc( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ - int okalloc, /* ok to allocate more space */ - xfs_buf_t **IO_agbp, /* in/out ag header's buffer */ - boolean_t *alloc_done, /* true if we needed to replenish - inode freelist */ - xfs_ino_t *inop) /* inode number allocated */ +STATIC int +xfs_ialloc_next_rec( + struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, + int *done, + int left) { - xfs_agnumber_t agcount; /* number of allocation groups */ - xfs_buf_t *agbp; /* allocation group header's buffer */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header structure */ - xfs_btree_cur_t *cur; /* inode allocation btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ialloced; /* inode allocation status */ - int noroom = 0; /* no space for inode blk allocation */ - xfs_ino_t ino; /* fs-relative inode to be returned */ - /* REFERENCED */ - int j; /* result code */ - xfs_mount_t *mp; /* file system mount structure */ - int offset; /* index of inode in chunk */ - xfs_agino_t pagino; /* parent's a.g. relative inode # */ - xfs_agnumber_t pagno; /* parent's allocation group number */ - xfs_inobt_rec_t rec; /* inode allocation record */ - xfs_agnumber_t tagno; /* testing allocation group number */ - xfs_btree_cur_t *tcur; /* temp cursor */ - xfs_inobt_rec_t trec; /* temp inode allocation record */ - - - if (*IO_agbp == NULL) { - /* - * We do not have an agbp, so select an initial allocation - * group for inode allocation. - */ - agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc); - /* - * Couldn't find an allocation group satisfying the - * criteria, give up. - */ - if (!agbp) { - *inop = NULLFSINO; - return 0; - } - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); - } else { - /* - * Continue where we left off before. In this case, we - * know that the allocation group has free inodes. - */ - agbp = *IO_agbp; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - } - mp = tp->t_mountp; - agcount = mp->m_sb.sb_agcount; - agno = be32_to_cpu(agi->agi_seqno); - tagno = agno; - pagno = XFS_INO_TO_AGNO(mp, parent); - pagino = XFS_INO_TO_AGINO(mp, parent); + int error; + int i; - /* - * If we have already hit the ceiling of inode blocks then clear - * okalloc so we scan all available agi structures for a free - * inode. - */ + if (left) + error = xfs_btree_decrement(cur, 0, &i); + else + error = xfs_btree_increment(cur, 0, &i); - if (mp->m_maxicount && - mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) { - noroom = 1; - okalloc = 0; + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); } - /* - * Loop until we find an allocation group that either has free inodes - * or in which we can allocate some inodes. Iterate through the - * allocation groups upward, wrapping at the end. - */ - *alloc_done = B_FALSE; - while (!agi->agi_freecount) { - /* - * Don't do anything if we're not supposed to allocate - * any blocks, just go on to the next ag. - */ - if (okalloc) { - /* - * Try to allocate some new inodes in the allocation - * group. - */ - if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) { - xfs_trans_brelse(tp, agbp); - if (error == ENOSPC) { - *inop = NULLFSINO; - return 0; - } else - return error; - } - if (ialloced) { - /* - * We successfully allocated some inodes, return - * the current context to the caller so that it - * can commit the current transaction and call - * us again where we left off. - */ - ASSERT(be32_to_cpu(agi->agi_freecount) > 0); - *alloc_done = B_TRUE; - *IO_agbp = agbp; - *inop = NULLFSINO; - return 0; - } - } - /* - * If it failed, give up on this ag. - */ - xfs_trans_brelse(tp, agbp); - /* - * Go on to the next ag: get its ag header. - */ -nextag: - if (++tagno == agcount) - tagno = 0; - if (tagno == agno) { - *inop = NULLFSINO; - return noroom ? ENOSPC : 0; - } - down_read(&mp->m_peraglock); - if (mp->m_perag[tagno].pagi_inodeok == 0) { - up_read(&mp->m_peraglock); - goto nextag; - } - error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp); - up_read(&mp->m_peraglock); + return 0; +} + +STATIC int +xfs_ialloc_get_rec( + struct xfs_btree_cur *cur, + xfs_agino_t agino, + xfs_inobt_rec_incore_t *rec, + int *done) +{ + int error; + int i; + + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + *done = !i; + if (i) { + error = xfs_inobt_get_rec(cur, rec, &i); if (error) - goto nextag; - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); } - /* - * Here with an allocation group that has a free inode. - * Reset agno since we may have chosen a new ag in the - * loop above. - */ - agno = tagno; - *IO_agbp = NULL; - cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno), - XFS_BTNUM_INO, (xfs_inode_t *)0, 0); + + return 0; +} + +/* + * Allocate an inode using the inobt-only algorithm. + */ +STATIC int +xfs_dialloc_ag_inobt( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur, *tcur; + struct xfs_inobt_rec_incore rec, trec; + xfs_ino_t ino; + int error; + int offset; + int i, j; + + pag = xfs_perag_get(mp, agno); + + ASSERT(pag->pagi_init); + ASSERT(pag->pagi_inodeok); + ASSERT(pag->pagi_freecount > 0); + + restart_pagno: + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); /* * If pagino is 0 (this is the root inode allocation) use newino. * This must work because we've just allocated some. */ if (!pagino) pagino = be32_to_cpu(agi->agi_newino); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) - goto error0; - } while (i == 1); + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif /* - * If in the same a.g. as the parent, try to get near the parent. + * If in the same AG as the parent, try to get near the parent. */ if (pagno == agno) { - if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) + int doneleft; /* done, to the left */ + int doneright; /* done, to the right */ + int searchdistance = 10; + + error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); + if (error) goto error0; - if (i != 0 && - (error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &j)) == 0 && - j == 1 && - rec.ir_freecount > 0) { + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + error = xfs_inobt_get_rec(cur, &rec, &j); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(j == 1, error0); + + if (rec.ir_freecount > 0) { /* * Found a free inode in the same chunk - * as parent, done. + * as the parent, done. */ + goto alloc_inode; } + + /* - * In the same a.g. as parent, but parent's chunk is full. + * In the same AG as parent, but parent's chunk is full. */ - else { - int doneleft; /* done, to the left */ - int doneright; /* done, to the right */ + /* duplicate the cursor, search left & right simultaneously */ + error = xfs_btree_dup_cursor(cur, &tcur); + if (error) + goto error0; + + /* + * Skip to last blocks looked up if same parent inode. + */ + if (pagino != NULLAGINO && + pag->pagl_pagino == pagino && + pag->pagl_leftrec != NULLAGINO && + pag->pagl_rightrec != NULLAGINO) { + error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, + &trec, &doneleft); if (error) - goto error0; - ASSERT(i == 1); - ASSERT(j == 1); - /* - * Duplicate the cursor, search left & right - * simultaneously. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - goto error0; - /* - * Search left with tcur, back up 1 record. - */ - if ((error = xfs_inobt_decrement(tcur, 0, &i))) goto error1; - doneleft = !i; - if (!doneleft) { - if ((error = xfs_inobt_get_rec(tcur, - &trec.ir_startino, - &trec.ir_freecount, - &trec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, error1); - } - /* - * Search right with cur, go forward 1 record. - */ - if ((error = xfs_inobt_increment(cur, 0, &i))) + + error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, + &rec, &doneright); + if (error) + goto error1; + } else { + /* search left with tcur, back up 1 record */ + error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); + if (error) goto error1; - doneright = !i; - if (!doneright) { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, error1); - } - /* - * Loop until we find the closest inode chunk - * with a free one. - */ - while (!doneleft || !doneright) { - int useleft; /* using left inode - chunk this time */ + /* search right with cur, go forward 1 record. */ + error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); + if (error) + goto error1; + } + + /* + * Loop until we find an inode chunk with a free inode. + */ + while (!doneleft || !doneright) { + int useleft; /* using left inode chunk this time */ + + if (!--searchdistance) { /* - * Figure out which block is closer, - * if both are valid. - */ - if (!doneleft && !doneright) - useleft = - pagino - - (trec.ir_startino + - XFS_INODES_PER_CHUNK - 1) < - rec.ir_startino - pagino; - else - useleft = !doneleft; - /* - * If checking the left, does it have - * free inodes? - */ - if (useleft && trec.ir_freecount) { - /* - * Yes, set it up as the chunk to use. - */ - rec = trec; - xfs_btree_del_cursor(cur, - XFS_BTREE_NOERROR); - cur = tcur; - break; - } - /* - * If checking the right, does it have - * free inodes? - */ - if (!useleft && rec.ir_freecount) { - /* - * Yes, it's already set up. - */ - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - break; - } - /* - * If used the left, get another one - * further left. - */ - if (useleft) { - if ((error = xfs_inobt_decrement(tcur, 0, - &i))) - goto error1; - doneleft = !i; - if (!doneleft) { - if ((error = xfs_inobt_get_rec( - tcur, - &trec.ir_startino, - &trec.ir_freecount, - &trec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, - error1); - } - } - /* - * If used the right, get another one - * further right. + * Not in range - save last search + * location and allocate a new inode */ - else { - if ((error = xfs_inobt_increment(cur, 0, - &i))) - goto error1; - doneright = !i; - if (!doneright) { - if ((error = xfs_inobt_get_rec( - cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) - goto error1; - XFS_WANT_CORRUPTED_GOTO(i == 1, - error1); - } - } + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto newino; } - ASSERT(!doneleft || !doneright); + + /* figure out the closer block if both are valid. */ + if (!doneleft && !doneright) { + useleft = pagino - + (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < + rec.ir_startino - pagino; + } else { + useleft = !doneleft; + } + + /* free inodes to the left? */ + if (useleft && trec.ir_freecount) { + rec = trec; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + cur = tcur; + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* free inodes to the right? */ + if (!useleft && rec.ir_freecount) { + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + goto alloc_inode; + } + + /* get next record to check */ + if (useleft) { + error = xfs_ialloc_next_rec(tcur, &trec, + &doneleft, 1); + } else { + error = xfs_ialloc_next_rec(cur, &rec, + &doneright, 0); + } + if (error) + goto error1; } + + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. + */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; } + /* - * In a different a.g. from the parent. + * In a different AG from the parent. * See if the most recently allocated block has any free. */ - else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { - if ((error = xfs_inobt_lookup_eq(cur, - be32_to_cpu(agi->agi_newino), 0, 0, &i))) +newino: + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); + if (error) goto error0; - if (i == 1 && - (error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &j)) == 0 && - j == 1 && - rec.ir_freecount > 0) { - /* - * The last chunk allocated in the group still has - * a free inode. - */ - } - /* - * None left in the last group, search the whole a.g. - */ - else { + + if (i == 1) { + error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - ASSERT(i == 1); - for (;;) { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, - &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (rec.ir_freecount > 0) - break; - if ((error = xfs_inobt_increment(cur, 0, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + if (j == 1 && rec.ir_freecount > 0) { + /* + * The last chunk allocated in the group + * still has a free inode. + */ + goto alloc_inode; } } } - offset = XFS_IALLOC_FIND_FREE(&rec.ir_free); + + /* + * None left in the last group, search the whole AG + */ + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + for (;;) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + if (rec.ir_freecount > 0) + break; + error = xfs_btree_increment(cur, 0, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + } + +alloc_inode: + offset = xfs_lowbit64(rec.ir_free); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); - XFS_INOBT_CLR_FREE(&rec, offset); + rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, - rec.ir_free))) + error = xfs_inobt_update(cur, &rec); + if (error) goto error0; - be32_add(&agi->agi_freecount, -1); + be32_add_cpu(&agi->agi_freecount, -1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); - mp->m_perag[tagno].pagi_freecount--; - up_read(&mp->m_peraglock); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; + pag->pagi_freecount--; + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) - goto error0; - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + xfs_perag_put(pag); *inop = ino; return 0; error1: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); error0: xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); return error; } /* - * Free disk inode. Carefully avoids touching the incore inode, all - * manipulations incore are the caller's responsibility. - * The on-disk inode is not changed by this operation, only the - * btree (free inode mask) is changed. + * Use the free inode btree to allocate an inode based on distance from the + * parent. Note that the provided cursor may be deleted and replaced. */ -int -xfs_difree( - xfs_trans_t *tp, /* transaction pointer */ - xfs_ino_t inode, /* inode to be freed */ - xfs_bmap_free_t *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino) /* first inode in deleted cluster */ +STATIC int +xfs_dialloc_ag_finobt_near( + xfs_agino_t pagino, + struct xfs_btree_cur **ocur, + struct xfs_inobt_rec_incore *rec) { - /* REFERENCED */ - xfs_agblock_t agbno; /* block number containing inode */ - xfs_buf_t *agbp; /* buffer containing allocation group header */ - xfs_agino_t agino; /* inode number relative to allocation group */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_agi_t *agi; /* allocation group header */ - xfs_btree_cur_t *cur; /* inode btree cursor */ - int error; /* error return value */ - int i; /* result code */ - int ilen; /* inodes in an inode cluster */ - xfs_mount_t *mp; /* mount structure for filesystem */ - int off; /* offset of inode in inode chunk */ - xfs_inobt_rec_t rec; /* btree record */ + struct xfs_btree_cur *lcur = *ocur; /* left search cursor */ + struct xfs_btree_cur *rcur; /* right search cursor */ + struct xfs_inobt_rec_incore rrec; + int error; + int i, j; - mp = tp->t_mountp; + error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i); + if (error) + return error; + + if (i == 1) { + error = xfs_inobt_get_rec(lcur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + /* + * See if we've landed in the parent inode record. The finobt + * only tracks chunks with at least one free inode, so record + * existence is enough. + */ + if (pagino >= rec->ir_startino && + pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK)) + return 0; + } + + error = xfs_btree_dup_cursor(lcur, &rcur); + if (error) + return error; + + error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j); + if (error) + goto error_rcur; + if (j == 1) { + error = xfs_inobt_get_rec(rcur, &rrec, &j); + if (error) + goto error_rcur; + XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + } + + XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + if (i == 1 && j == 1) { + /* + * Both the left and right records are valid. Choose the closer + * inode chunk to the target. + */ + if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) > + (rrec.ir_startino - pagino)) { + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else { + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + } else if (j == 1) { + /* only the right record is valid */ + *rec = rrec; + xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR); + *ocur = rcur; + } else if (i == 1) { + /* only the left record is valid */ + xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR); + } + + return 0; + +error_rcur: + xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR); + return error; +} + +/* + * Use the free inode btree to find a free inode based on a newino hint. If + * the hint is NULL, find the first free inode in the AG. + */ +STATIC int +xfs_dialloc_ag_finobt_newino( + struct xfs_agi *agi, + struct xfs_btree_cur *cur, + struct xfs_inobt_rec_incore *rec) +{ + int error; + int i; + + if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { + error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, + &i); + if (error) + return error; + if (i == 1) { + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; + } + } /* - * Break up inode number into its components. + * Find the first inode available in the AG. */ - agno = XFS_INO_TO_AGNO(mp, inode); - if (agno >= mp->m_sb.sb_agcount) { - cmn_err(CE_WARN, - "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", - agno, mp->m_sb.sb_agcount, mp->m_fsname); - ASSERT(0); - return XFS_ERROR(EINVAL); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + return 0; +} + +/* + * Update the inobt based on a modification made to the finobt. Also ensure that + * the records from both trees are equivalent post-modification. + */ +STATIC int +xfs_dialloc_ag_update_inobt( + struct xfs_btree_cur *cur, /* inobt cursor */ + struct xfs_inobt_rec_incore *frec, /* finobt record */ + int offset) /* inode offset */ +{ + struct xfs_inobt_rec_incore rec; + int error; + int i; + + error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(i == 1); + ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + + XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + (rec.ir_freecount == frec->ir_freecount)); + + error = xfs_inobt_update(cur, &rec); + if (error) + return error; + + return 0; +} + +/* + * Allocate an inode using the free inode btree, if available. Otherwise, fall + * back to the inobt search algorithm. + * + * The caller selected an AG for us, and made sure that free inodes are + * available. + */ +STATIC int +xfs_dialloc_ag( + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_ino_t parent, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent); + xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; /* finobt cursor */ + struct xfs_btree_cur *icur; /* inobt cursor */ + struct xfs_inobt_rec_incore rec; + xfs_ino_t ino; + int error; + int offset; + int i; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return xfs_dialloc_ag_inobt(tp, agbp, parent, inop); + + pag = xfs_perag_get(mp, agno); + + /* + * If pagino is 0 (this is the root inode allocation) use newino. + * This must work because we've just allocated some. + */ + if (!pagino) + pagino = be32_to_cpu(agi->agi_newino); + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_cur; + + /* + * The search algorithm depends on whether we're in the same AG as the + * parent. If so, find the closest available inode to the parent. If + * not, consider the agi hint or find the first free inode in the AG. + */ + if (agno == pagno) + error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec); + else + error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec); + if (error) + goto error_cur; + + offset = xfs_lowbit64(rec.ir_free); + ASSERT(offset >= 0); + ASSERT(offset < XFS_INODES_PER_CHUNK); + ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % + XFS_INODES_PER_CHUNK) == 0); + ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); + + /* + * Modify or remove the finobt record. + */ + rec.ir_free &= ~XFS_INOBT_MASK(offset); + rec.ir_freecount--; + if (rec.ir_freecount) + error = xfs_inobt_update(cur, &rec); + else + error = xfs_btree_delete(cur, &i); + if (error) + goto error_cur; + + /* + * The finobt has now been updated appropriately. We haven't updated the + * agi and superblock yet, so we can create an inobt cursor and validate + * the original freecount. If all is well, make the equivalent update to + * the inobt using the finobt record and offset information. + */ + icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + + error = xfs_dialloc_ag_update_inobt(icur, &rec, offset); + if (error) + goto error_icur; + + /* + * Both trees have now been updated. We must update the perag and + * superblock before we can check the freecount for each btree. + */ + be32_add_cpu(&agi->agi_freecount, -1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + pag->pagi_freecount--; + + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); + + error = xfs_check_agi_freecount(icur, agi); + if (error) + goto error_icur; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error_icur; + + xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + xfs_perag_put(pag); + *inop = ino; + return 0; + +error_icur: + xfs_btree_del_cursor(icur, XFS_BTREE_ERROR); +error_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + xfs_perag_put(pag); + return error; +} + +/* + * Allocate an inode on disk. + * + * Mode is used to tell whether the new inode will need space, and whether it + * is a directory. + * + * This function is designed to be called twice if it has to do an allocation + * to make more free inodes. On the first call, *IO_agbp should be set to NULL. + * If an inode is available without having to performn an allocation, an inode + * number is returned. In this case, *IO_agbp is set to NULL. If an allocation + * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. + * The caller should then commit the current transaction, allocate a + * new transaction, and call xfs_dialloc() again, passing in the previous value + * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI + * buffer is locked across the two calls, the second call is guaranteed to have + * a free inode available. + * + * Once we successfully pick an inode its number is returned and the on-disk + * data structures are updated. The inode itself is not read in, since doing so + * would break ordering constraints with xfs_reclaim. + */ +int +xfs_dialloc( + struct xfs_trans *tp, + xfs_ino_t parent, + umode_t mode, + int okalloc, + struct xfs_buf **IO_agbp, + xfs_ino_t *inop) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agbp; + xfs_agnumber_t agno; + int error; + int ialloced; + int noroom = 0; + xfs_agnumber_t start_agno; + struct xfs_perag *pag; + + if (*IO_agbp) { + /* + * If the caller passes in a pointer to the AGI buffer, + * continue where we left off before. In this case, we + * know that the allocation group has free inodes. + */ + agbp = *IO_agbp; + goto out_alloc; } - agino = XFS_INO_TO_AGINO(mp, inode); - if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { - cmn_err(CE_WARN, - "xfs_difree: inode != XFS_AGINO_TO_INO() " - "(%llu != %llu) on %s. Returning EINVAL.", - (unsigned long long)inode, - (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino), - mp->m_fsname); - ASSERT(0); - return XFS_ERROR(EINVAL); + + /* + * We do not have an agbp, so select an initial allocation + * group for inode allocation. + */ + start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc); + if (start_agno == NULLAGNUMBER) { + *inop = NULLFSINO; + return 0; } - agbno = XFS_AGINO_TO_AGBNO(mp, agino); - if (agbno >= mp->m_sb.sb_agblocks) { - cmn_err(CE_WARN, - "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", - agbno, mp->m_sb.sb_agblocks, mp->m_fsname); - ASSERT(0); - return XFS_ERROR(EINVAL); + + /* + * If we have already hit the ceiling of inode blocks then clear + * okalloc so we scan all available agi structures for a free + * inode. + */ + if (mp->m_maxicount && + mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { + noroom = 1; + okalloc = 0; } + /* - * Get the allocation group header. + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. */ - down_read(&mp->m_peraglock); - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); - if (error) { - cmn_err(CE_WARN, - "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", - error, mp->m_fsname); - return error; + agno = start_agno; + for (;;) { + pag = xfs_perag_get(mp, agno); + if (!pag->pagi_inodeok) { + xfs_ialloc_next_ag(mp); + goto nextag; + } + + if (!pag->pagi_init) { + error = xfs_ialloc_pagi_init(mp, tp, agno); + if (error) + goto out_error; + } + + /* + * Do a first racy fast path check if this AG is usable. + */ + if (!pag->pagi_freecount && !okalloc) + goto nextag; + + /* + * Then read in the AGI buffer and recheck with the AGI buffer + * lock held. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) + goto out_error; + + if (pag->pagi_freecount) { + xfs_perag_put(pag); + goto out_alloc; + } + + if (!okalloc) + goto nextag_relse_buffer; + + + error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced); + if (error) { + xfs_trans_brelse(tp, agbp); + + if (error != ENOSPC) + goto out_error; + + xfs_perag_put(pag); + *inop = NULLFSINO; + return 0; + } + + if (ialloced) { + /* + * We successfully allocated some inodes, return + * the current context to the caller so that it + * can commit the current transaction and call + * us again where we left off. + */ + ASSERT(pag->pagi_freecount > 0); + xfs_perag_put(pag); + + *IO_agbp = agbp; + *inop = NULLFSINO; + return 0; + } + +nextag_relse_buffer: + xfs_trans_brelse(tp, agbp); +nextag: + xfs_perag_put(pag); + if (++agno == mp->m_sb.sb_agcount) + agno = 0; + if (agno == start_agno) { + *inop = NULLFSINO; + return noroom ? ENOSPC : 0; + } } - agi = XFS_BUF_TO_AGI(agbp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); - ASSERT(agbno < be32_to_cpu(agi->agi_length)); + +out_alloc: + *IO_agbp = NULL; + return xfs_dialloc_ag(tp, agbp, parent, inop); +out_error: + xfs_perag_put(pag); + return XFS_ERROR(error); +} + +STATIC int +xfs_difree_inobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_bmap_free *flist, + int *deleted, + xfs_ino_t *first_ino, + struct xfs_inobt_rec_incore *orec) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_perag *pag; + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int ilen; + int error; + int i; + int off; + + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); + ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length)); + /* * Initialize the cursor. */ - cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i))) - goto error0; - if (i) { - freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) - goto error0; - } - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif /* * Look for the entry describing this inode. */ - if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { - cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", - error, mp->m_fsname); + if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { + xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.", + __func__, error); goto error0; } XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, - &rec.ir_free, &i))) { - cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", + __func__, error); goto error0; } XFS_WANT_CORRUPTED_GOTO(i == 1, error0); @@ -1042,20 +1482,20 @@ xfs_difree( */ off = agino - rec.ir_startino; ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK); - ASSERT(!XFS_INOBT_IS_FREE(&rec, off)); + ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off))); /* * Mark the inode free & increment the count. */ - XFS_INOBT_SET_FREE(&rec, off); + rec.ir_free |= XFS_INOBT_MASK(off); rec.ir_freecount++; /* * When an inode cluster is free, it becomes eligible for removal */ - if ((mp->m_flags & XFS_MOUNT_IDELETE) && - (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { + if (!(mp->m_flags & XFS_MOUNT_IKEEP) && + (rec.ir_freecount == mp->m_ialloc_inos)) { - *delete = 1; + *deleted = 1; *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); /* @@ -1063,67 +1503,51 @@ xfs_difree( * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = XFS_IALLOC_INODES(mp); - be32_add(&agi->agi_count, -ilen); - be32_add(&agi->agi_freecount, -(ilen - 1)); + ilen = mp->m_ialloc_inos; + be32_add_cpu(&agi->agi_count, -ilen); + be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); - mp->m_perag[agno].pagi_freecount -= ilen - 1; - up_read(&mp->m_peraglock); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount -= ilen - 1; + xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); - if ((error = xfs_inobt_delete(cur, &i))) { - cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", - error, mp->m_fsname); + if ((error = xfs_btree_delete(cur, &i))) { + xfs_warn(mp, "%s: xfs_btree_delete returned error %d.", + __func__, error); goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, - agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), - XFS_IALLOC_BLOCKS(mp), flist, mp); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), + mp->m_ialloc_blks, flist, mp); } else { - *delete = 0; + *deleted = 0; - if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { - cmn_err(CE_WARN, - "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + error = xfs_inobt_update(cur, &rec); + if (error) { + xfs_warn(mp, "%s: xfs_inobt_update returned error %d.", + __func__, error); goto error0; } + /* * Change the inode free counts and log the ag/sb changes. */ - be32_add(&agi->agi_freecount, 1); + be32_add_cpu(&agi->agi_freecount, 1); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); - down_read(&mp->m_peraglock); - mp->m_perag[agno].pagi_freecount++; - up_read(&mp->m_peraglock); + pag = xfs_perag_get(mp, agno); + pag->pagi_freecount++; + xfs_perag_put(pag); xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); } -#ifdef DEBUG - if (cur->bc_nlevels == 1) { - int freecount = 0; + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error0; - if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) - goto error0; - do { - if ((error = xfs_inobt_get_rec(cur, - &rec.ir_startino, - &rec.ir_freecount, - &rec.ir_free, &i))) - goto error0; - if (i) { - freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) - goto error0; - } - } while (i == 1); - ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); - } -#endif + *orec = rec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); return 0; @@ -1133,36 +1557,264 @@ error0: } /* - * Return the location of the inode in bno/off, for mapping it into a buffer. + * Free an inode in the free inode btree. + */ +STATIC int +xfs_difree_finobt( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + xfs_agino_t agino, + struct xfs_inobt_rec_incore *ibtrec) /* inobt record */ +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + struct xfs_btree_cur *cur; + struct xfs_inobt_rec_incore rec; + int offset = agino - ibtrec->ir_startino; + int error; + int i; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO); + + error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + if (i == 0) { + /* + * If the record does not exist in the finobt, we must have just + * freed an inode in a previously fully allocated chunk. If not, + * something is out of sync. + */ + XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + + error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + ibtrec->ir_free, &i); + if (error) + goto error; + ASSERT(i == 1); + + goto out; + } + + /* + * Read and update the existing record. We could just copy the ibtrec + * across here, but that would defeat the purpose of having redundant + * metadata. By making the modifications independently, we can catch + * corruptions that we wouldn't see if we just copied from one record + * to another. + */ + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(i == 1, error); + + rec.ir_free |= XFS_INOBT_MASK(offset); + rec.ir_freecount++; + + XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + (rec.ir_freecount == ibtrec->ir_freecount), + error); + + /* + * The content of inobt records should always match between the inobt + * and finobt. The lifecycle of records in the finobt is different from + * the inobt in that the finobt only tracks records with at least one + * free inode. Hence, if all of the inodes are free and we aren't + * keeping inode chunks permanently on disk, remove the record. + * Otherwise, update the record with the new information. + */ + if (rec.ir_freecount == mp->m_ialloc_inos && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + error = xfs_btree_delete(cur, &i); + if (error) + goto error; + ASSERT(i == 1); + } else { + error = xfs_inobt_update(cur, &rec); + if (error) + goto error; + } + +out: + error = xfs_check_agi_freecount(cur, agi); + if (error) + goto error; + + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; + +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* + * Free disk inode. Carefully avoids touching the incore inode, all + * manipulations incore are the caller's responsibility. + * The on-disk inode is not changed by this operation, only the + * btree (free inode mask) is changed. + */ +int +xfs_difree( + struct xfs_trans *tp, /* transaction pointer */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *deleted,/* set if inode cluster was deleted */ + xfs_ino_t *first_ino)/* first inode in deleted cluster */ +{ + /* REFERENCED */ + xfs_agblock_t agbno; /* block number containing inode */ + struct xfs_buf *agbp; /* buffer for allocation group header */ + xfs_agino_t agino; /* allocation group inode number */ + xfs_agnumber_t agno; /* allocation group number */ + int error; /* error return value */ + struct xfs_mount *mp; /* mount structure for filesystem */ + struct xfs_inobt_rec_incore rec;/* btree record */ + + mp = tp->t_mountp; + + /* + * Break up inode number into its components. + */ + agno = XFS_INO_TO_AGNO(mp, inode); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).", + __func__, agno, mp->m_sb.sb_agcount); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agino = XFS_INO_TO_AGINO(mp, inode); + if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { + xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).", + __func__, (unsigned long long)inode, + (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino)); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + agbno = XFS_AGINO_TO_AGBNO(mp, agino); + if (agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).", + __func__, agbno, mp->m_sb.sb_agblocks); + ASSERT(0); + return XFS_ERROR(EINVAL); + } + /* + * Get the allocation group header. + */ + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.", + __func__, error); + return error; + } + + /* + * Fix up the inode allocation btree. + */ + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, + &rec); + if (error) + goto error0; + + /* + * Fix up the free inode btree. + */ + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + error = xfs_difree_finobt(mp, tp, agbp, agino, &rec); + if (error) + goto error0; + } + + return 0; + +error0: + return error; +} + +STATIC int +xfs_imap_lookup( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agino_t agino, + xfs_agblock_t agbno, + xfs_agblock_t *chunk_agbno, + xfs_agblock_t *offset_agbno, + int flags) +{ + struct xfs_inobt_rec_incore rec; + struct xfs_btree_cur *cur; + struct xfs_buf *agbp; + int error; + int i; + + error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); + if (error) { + xfs_alert(mp, + "%s: xfs_ialloc_read_agi() returned error %d, agno %d", + __func__, error, agno); + return error; + } + + /* + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. + */ + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); + if (!error) { + if (i) + error = xfs_inobt_get_rec(cur, &rec, &i); + if (!error && i == 0) + error = EINVAL; + } + + xfs_trans_brelse(tp, agbp); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + if (error) + return error; + + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + mp->m_ialloc_inos <= agino) + return EINVAL; + + /* for untrusted inodes check it is allocated first */ + if ((flags & XFS_IGET_UNTRUSTED) && + (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) + return EINVAL; + + *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino); + *offset_agbno = agbno - *chunk_agbno; + return 0; +} + +/* + * Return the location of the inode in imap, for mapping it into a buffer. */ -/*ARGSUSED*/ int -xfs_dilocate( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ +xfs_imap( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ xfs_ino_t ino, /* inode to locate */ - xfs_fsblock_t *bno, /* output: block containing inode */ - int *len, /* output: num blocks in inode cluster */ - int *off, /* output: index in block of inode */ - uint flags) /* flags concerning inode lookup */ + struct xfs_imap *imap, /* location map structure */ + uint flags) /* flags for inode btree lookup */ { xfs_agblock_t agbno; /* block number of inode in the alloc group */ - xfs_buf_t *agbp; /* agi buffer */ xfs_agino_t agino; /* inode number within alloc group */ xfs_agnumber_t agno; /* allocation group number */ int blks_per_cluster; /* num blocks per inode cluster */ xfs_agblock_t chunk_agbno; /* first block in inode chunk */ - xfs_agino_t chunk_agino; /* first agino in inode chunk */ - __int32_t chunk_cnt; /* count of free inodes in chunk */ - xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ xfs_agblock_t cluster_agbno; /* first block in inode cluster */ - xfs_btree_cur_t *cur; /* inode btree cursor */ int error; /* error code */ - int i; /* temp state */ int offset; /* index of inode in its buffer */ - int offset_agbno; /* blks from chunk start to inode */ + xfs_agblock_t offset_agbno; /* blks from chunk start to inode */ ASSERT(ino != NULLFSINO); + /* * Split up the inode number into its parts. */ @@ -1172,111 +1824,107 @@ xfs_dilocate( if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, agno, agino)) { #ifdef DEBUG - /* no diagnostics for bulkstat, ino comes from userspace */ - if (flags & XFS_IMAP_BULKSTAT) + /* + * Don't output diagnostic information for untrusted inodes + * as they can be invalid without implying corruption. + */ + if (flags & XFS_IGET_UNTRUSTED) return XFS_ERROR(EINVAL); if (agno >= mp->m_sb.sb_agcount) { - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_dilocate: agno (%d) >= " - "mp->m_sb.sb_agcount (%d)", - agno, mp->m_sb.sb_agcount); + xfs_alert(mp, + "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)", + __func__, agno, mp->m_sb.sb_agcount); } if (agbno >= mp->m_sb.sb_agblocks) { - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_dilocate: agbno (0x%llx) >= " - "mp->m_sb.sb_agblocks (0x%lx)", - (unsigned long long) agbno, - (unsigned long) mp->m_sb.sb_agblocks); + xfs_alert(mp, + "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)", + __func__, (unsigned long long)agbno, + (unsigned long)mp->m_sb.sb_agblocks); } if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_dilocate: ino (0x%llx) != " - "XFS_AGINO_TO_INO(mp, agno, agino) " - "(0x%llx)", - ino, XFS_AGINO_TO_INO(mp, agno, agino)); + xfs_alert(mp, + "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)", + __func__, ino, + XFS_AGINO_TO_INO(mp, agno, agino)); } + xfs_stack_trace(); #endif /* DEBUG */ return XFS_ERROR(EINVAL); } - if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) || - !(flags & XFS_IMAP_LOOKUP)) { - offset = XFS_INO_TO_OFFSET(mp, ino); - ASSERT(offset < mp->m_sb.sb_inopblock); - *bno = XFS_AGB_TO_FSB(mp, agno, agbno); - *off = offset; - *len = 1; - return 0; + + blks_per_cluster = xfs_icluster_size_fsb(mp); + + /* + * For bulkstat and handle lookups, we have an untrusted inode number + * that we have to verify is valid. We cannot do this just by reading + * the inode buffer as it may have been unlinked and removed leaving + * inodes in stale state on disk. Hence we have to do a btree lookup + * in all cases where an untrusted inode number is passed. + */ + if (flags & XFS_IGET_UNTRUSTED) { + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); + if (error) + return error; + goto out_map; } - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog; - if (*bno != NULLFSBLOCK) { + + /* + * If the inode cluster size is the same as the blocksize or + * smaller we get to the buffer by simple arithmetics. + */ + if (blks_per_cluster == 1) { offset = XFS_INO_TO_OFFSET(mp, ino); ASSERT(offset < mp->m_sb.sb_inopblock); - cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno); - *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + - offset; - *len = blks_per_cluster; + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); + imap->im_len = XFS_FSB_TO_BB(mp, 1); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); return 0; } + + /* + * If the inode chunks are aligned then use simple maths to + * find the location. Otherwise we have to do a btree + * lookup to find the location. + */ if (mp->m_inoalign_mask) { offset_agbno = agbno & mp->m_inoalign_mask; chunk_agbno = agbno - offset_agbno; } else { - down_read(&mp->m_peraglock); - error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); - up_read(&mp->m_peraglock); - if (error) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " - "xfs_ialloc_read_agi() returned " - "error %d, agno %d", - error, agno); -#endif /* DEBUG */ - return error; - } - cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); - if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " - "xfs_inobt_lookup_le() failed"); -#endif /* DEBUG */ - goto error0; - } - if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, - &chunk_free, &i))) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " - "xfs_inobt_get_rec() failed"); -#endif /* DEBUG */ - goto error0; - } - if (i == 0) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: " - "xfs_inobt_get_rec() failed"); -#endif /* DEBUG */ - error = XFS_ERROR(EINVAL); - } - xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + error = xfs_imap_lookup(mp, tp, agno, agino, agbno, + &chunk_agbno, &offset_agbno, flags); if (error) return error; - chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); - offset_agbno = agbno - chunk_agbno; } + +out_map: ASSERT(agbno >= chunk_agbno); cluster_agbno = chunk_agbno + ((offset_agbno / blks_per_cluster) * blks_per_cluster); offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) + XFS_INO_TO_OFFSET(mp, ino); - *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno); - *off = offset; - *len = blks_per_cluster; + + imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); + imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); + imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + + /* + * If the inode number maps to a block outside the bounds + * of the file system then return NULL rather than calling + * read_buf and panicing when we get an error from the + * driver. + */ + if ((imap->im_blkno + imap->im_len) > + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { + xfs_alert(mp, + "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)", + __func__, (unsigned long long) imap->im_blkno, + (unsigned long long) imap->im_len, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); + return XFS_ERROR(EINVAL); + } return 0; -error0: - xfs_trans_brelse(tp, agbp); - xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); - return error; } /* @@ -1303,7 +1951,16 @@ xfs_ialloc_compute_maxlevels( } /* - * Log specified fields for the ag hdr (inode section) + * Log specified fields for the ag hdr (inode section). The growth of the agi + * structure over time requires that we interpret the buffer as two logical + * regions delineated by the end of the unlinked list. This is due to the size + * of the hash table and its location in the middle of the agi. + * + * For example, a request to log a field before agi_unlinked and a field after + * agi_unlinked could cause us to log the entire hash table and use an excessive + * amount of log space. To avoid this behavior, log the region up through + * agi_unlinked in one call and the region after agi_unlinked through the end of + * the structure in another. */ void xfs_ialloc_log_agi( @@ -1326,86 +1983,207 @@ xfs_ialloc_log_agi( offsetof(xfs_agi_t, agi_newino), offsetof(xfs_agi_t, agi_dirino), offsetof(xfs_agi_t, agi_unlinked), + offsetof(xfs_agi_t, agi_free_root), + offsetof(xfs_agi_t, agi_free_level), sizeof(xfs_agi_t) }; #ifdef DEBUG xfs_agi_t *agi; /* allocation group header */ agi = XFS_BUF_TO_AGI(bp); - ASSERT(be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC); + ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); +#endif + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); + + /* + * Compute byte offsets for the first and last fields in the first + * region and log the agi buffer. This only logs up through + * agi_unlinked. + */ + if (fields & XFS_AGI_ALL_BITS_R1) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } + + /* + * Mask off the bits in the first region and calculate the first and + * last field offsets for any bits in the second region. + */ + fields &= ~XFS_AGI_ALL_BITS_R1; + if (fields) { + xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2, + &first, &last); + xfs_trans_log_buf(tp, bp, first, last); + } +} + +#ifdef DEBUG +STATIC void +xfs_check_agi_unlinked( + struct xfs_agi *agi) +{ + int i; + + for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) + ASSERT(agi->agi_unlinked[i]); +} +#else +#define xfs_check_agi_unlinked(agi) #endif + +static bool +xfs_agi_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_uuid)) + return false; /* - * Compute byte offsets for the first and last fields. + * Validate the magic number of the agi block. */ - xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last); + if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC)) + return false; + if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) + return false; + /* - * Log the allocation group inode header buffer. + * during growfs operations, the perag is not fully initialised, + * so we can't use it for any useful checking. growfs ensures we can't + * use it by using uncached buffers that don't have the perag attached + * so we can detect and avoid this problem. */ - xfs_trans_log_buf(tp, bp, first, last); + if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno) + return false; + + xfs_check_agi_unlinked(agi); + return true; } +static void +xfs_agi_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb) && + !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp, + XFS_ERRTAG_IALLOC_READ_AGI, + XFS_RANDOM_IALLOC_READ_AGI)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_agi_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + if (!xfs_agi_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn); + xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF); +} + +const struct xfs_buf_ops xfs_agi_buf_ops = { + .verify_read = xfs_agi_read_verify, + .verify_write = xfs_agi_write_verify, +}; + /* * Read in the allocation group header (inode allocation section) */ int -xfs_ialloc_read_agi( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_agnumber_t agno, /* allocation group number */ - xfs_buf_t **bpp) /* allocation group hdr buf */ +xfs_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ { - xfs_agi_t *agi; /* allocation group header */ - int agi_ok; /* agi is consistent */ - xfs_buf_t *bp; /* allocation group hdr buf */ - xfs_perag_t *pag; /* per allocation group data */ - int error; + int error; + + trace_xfs_read_agi(mp, agno); ASSERT(agno != NULLAGNUMBER); - error = xfs_trans_read_buf( - mp, tp, mp->m_ddev_targp, + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp); + XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; - ASSERT(bp && !XFS_BUF_GETERROR(bp)); - /* - * Validate the magic number of the agi block. - */ - agi = XFS_BUF_TO_AGI(bp); - agi_ok = - be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, - XFS_RANDOM_IALLOC_READ_AGI))) { - XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } - pag = &mp->m_perag[agno]; + xfs_buf_set_ref(*bpp, XFS_AGI_REF); + return 0; +} + +int +xfs_ialloc_read_agi( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno, /* allocation group number */ + struct xfs_buf **bpp) /* allocation group hdr buf */ +{ + struct xfs_agi *agi; /* allocation group header */ + struct xfs_perag *pag; /* per allocation group data */ + int error; + + trace_xfs_ialloc_read_agi(mp, agno); + + error = xfs_read_agi(mp, tp, agno, bpp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(*bpp); + pag = xfs_perag_get(mp, agno); if (!pag->pagi_init) { pag->pagi_freecount = be32_to_cpu(agi->agi_freecount); + pag->pagi_count = be32_to_cpu(agi->agi_count); pag->pagi_init = 1; - } else { - /* - * It's possible for these to be out of sync if - * we are in the middle of a forced shutdown. - */ - ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); } -#ifdef DEBUG - { - int i; + /* + * It's possible for these to be out of sync if + * we are in the middle of a forced shutdown. + */ + ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || + XFS_FORCED_SHUTDOWN(mp)); + xfs_perag_put(pag); + return 0; +} - for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) - ASSERT(agi->agi_unlinked[i]); - } -#endif +/* + * Read in the agi to initialise the per-ag data in the mount structure + */ +int +xfs_ialloc_pagi_init( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_agnumber_t agno) /* allocation group number */ +{ + xfs_buf_t *bp = NULL; + int error; - XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF); - *bpp = bp; + error = xfs_ialloc_read_agi(mp, tp, agno, &bp); + if (error) + return error; + if (bp) + xfs_trans_brelse(tp, bp); return 0; } diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 7f5debe1acb..95ad1c002d6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h @@ -20,49 +20,35 @@ struct xfs_buf; struct xfs_dinode; +struct xfs_imap; struct xfs_mount; struct xfs_trans; +struct xfs_btree_cur; -/* - * Allocation parameters for inode allocation. - */ -#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos -#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks - -/* - * For small block file systems, move inodes in clusters of this size. - * When we don't have a lot of memory, however, we go a bit smaller - * to reduce the number of AGI and ialloc btree blocks we need to keep - * around for xfs_dilocate(). We choose which one to use in - * xfs_mount_int(). - */ +/* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 -#define XFS_INODE_SMALL_CLUSTER_SIZE 4096 -#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size + +/* Calculate and return the number of filesystem blocks per inode cluster */ +static inline int +xfs_icluster_size_fsb( + struct xfs_mount *mp) +{ + if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size) + return 1; + return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog; +} /* * Make an inode pointer out of the buffer/offset. */ -#define XFS_MAKE_IPTR(mp,b,o) xfs_make_iptr(mp,b,o) static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (xfs_dinode_t *) + return (struct xfs_dinode *) (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); } /* - * Find a free (set) bit in the inode bitmask. - */ -#define XFS_IALLOC_FIND_FREE(fp) xfs_ialloc_find_free(fp) -static inline int xfs_ialloc_find_free(xfs_inofree_t *fp) -{ - return xfs_lowbit64(*fp); -} - - -#ifdef __KERNEL__ -/* * Allocate an inode on disk. * Mode is used to tell whether the new inode will need space, and whether * it is a directory. @@ -88,11 +74,9 @@ int /* error */ xfs_dialloc( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t parent, /* parent inode (directory) */ - mode_t mode, /* mode bits for new inode */ + umode_t mode, /* mode bits for new inode */ int okalloc, /* ok to allocate more space */ struct xfs_buf **agbp, /* buf for a.g. inode header */ - boolean_t *alloc_done, /* an allocation was done to replenish - the free inodes */ xfs_ino_t *inop); /* inode number allocated */ /* @@ -106,21 +90,18 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *delete, /* set if inode cluster was deleted */ + int *deleted, /* set if inode cluster was deleted */ xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* - * Return the location of the inode in bno/len/off, - * for mapping it into a buffer. + * Return the location of the inode in imap, for mapping it into a buffer. */ int -xfs_dilocate( +xfs_imap( struct xfs_mount *mp, /* file system mount structure */ struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t ino, /* inode to locate */ - xfs_fsblock_t *bno, /* output: block containing inode */ - int *len, /* output: num blocks in cluster*/ - int *off, /* output: index in block of inode */ + struct xfs_imap *imap, /* location map structure */ uint flags); /* flags for inode btree lookup */ /* @@ -149,6 +130,34 @@ xfs_ialloc_read_agi( xfs_agnumber_t agno, /* allocation group number */ struct xfs_buf **bpp); /* allocation group hdr buf */ -#endif /* __KERNEL__ */ +/* + * Read in the allocation group header to initialise the per-ag data + * in the mount structure + */ +int +xfs_ialloc_pagi_init( + struct xfs_mount *mp, /* file system mount structure */ + struct xfs_trans *tp, /* transaction pointer */ + xfs_agnumber_t agno); /* allocation group number */ + +/* + * Lookup a record by ino in the btree given by cur. + */ +int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, + xfs_lookup_t dir, int *stat); + +/* + * Get the data from the pointed-to record. + */ +int xfs_inobt_get_rec(struct xfs_btree_cur *cur, + xfs_inobt_rec_incore_t *rec, int *stat); + +/* + * Inode chunk initialisation routine + */ +int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, + struct list_head *buffer_list, + xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_agblock_t length, unsigned int gen); #endif /* __XFS_IALLOC_H__ */ diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 616eeeb6953..726f83a681a 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c @@ -17,2062 +17,406 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_alloc.h" #include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" -STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int); -STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int); -STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *); -STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *); -STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *, - xfs_inobt_key_t *, xfs_btree_cur_t **, int *); -STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int); -/* - * Single level of the xfs_inobt_delete record deletion routine. - * Delete record pointed to by cur/level. - * Remove the record from its block then rebalance the tree. - * Return 0 for error, 1 for done, 2 to go on to the next level. - */ -STATIC int /* error */ -xfs_inobt_delrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level removing record from */ - int *stat) /* fail/done/go-on */ +STATIC int +xfs_inobt_get_minrecs( + struct xfs_btree_cur *cur, + int level) { - xfs_buf_t *agbp; /* buffer for a.g. inode header */ - xfs_mount_t *mp; /* mount structure */ - xfs_agi_t *agi; /* allocation group inode header */ - xfs_inobt_block_t *block; /* btree block record/key lives in */ - xfs_agblock_t bno; /* btree block number */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_inobt_key_t key; /* kp points here if block is level 0 */ - xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */ - xfs_agblock_t lbno; /* left block's block number */ - xfs_buf_t *lbp; /* left block's buffer pointer */ - xfs_inobt_block_t *left; /* left btree block */ - xfs_inobt_key_t *lkp; /* left block key pointer */ - xfs_inobt_ptr_t *lpp; /* left block address pointer */ - int lrecs = 0; /* number of records in left block */ - xfs_inobt_rec_t *lrp; /* left block record pointer */ - xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */ - int ptr; /* index in btree block for this rec */ - xfs_agblock_t rbno; /* right block's block number */ - xfs_buf_t *rbp; /* right block's buffer pointer */ - xfs_inobt_block_t *right; /* right btree block */ - xfs_inobt_key_t *rkp; /* right block key pointer */ - xfs_inobt_rec_t *rp; /* pointer to btree records */ - xfs_inobt_ptr_t *rpp; /* right block address pointer */ - int rrecs = 0; /* number of records in right block */ - int numrecs; - xfs_inobt_rec_t *rrp; /* right block record pointer */ - xfs_btree_cur_t *tcur; /* temporary btree cursor */ - - mp = cur->bc_mp; + return cur->bc_mp->m_inobt_mnr[level != 0]; +} - /* - * Get the index of the entry being deleted, check for nothing there. - */ - ptr = cur->bc_ptrs[level]; - if (ptr == 0) { - *stat = 0; - return 0; - } +STATIC struct xfs_btree_cur * +xfs_inobt_dup_cursor( + struct xfs_btree_cur *cur) +{ + return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp, + cur->bc_private.a.agbp, cur->bc_private.a.agno, + cur->bc_btnum); +} - /* - * Get the buffer & block containing the record or key/ptr. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Fail if we're off the end of the block. - */ +STATIC void +xfs_inobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); - numrecs = be16_to_cpu(block->bb_numrecs); - if (ptr > numrecs) { - *stat = 0; - return 0; - } - /* - * It's a nonleaf. Excise the key and ptr being deleted, by - * sliding the entries past them down one. - * Log the changed areas of the block. - */ - if (level > 0) { - kp = XFS_INOBT_KEY_ADDR(block, 1, cur); - pp = XFS_INOBT_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = ptr; i < numrecs; i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level))) - return error; - } -#endif - if (ptr < numrecs) { - memmove(&kp[ptr - 1], &kp[ptr], - (numrecs - ptr) * sizeof(*kp)); - memmove(&pp[ptr - 1], &pp[ptr], - (numrecs - ptr) * sizeof(*kp)); - xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1); - xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1); - } - } - /* - * It's a leaf. Excise the record being deleted, by sliding the - * entries past it down one. Log the changed areas of the block. - */ - else { - rp = XFS_INOBT_REC_ADDR(block, 1, cur); - if (ptr < numrecs) { - memmove(&rp[ptr - 1], &rp[ptr], - (numrecs - ptr) * sizeof(*rp)); - xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1); - } - /* - * If it's the first record in the block, we'll need a key - * structure to pass up to the next level (updkey). - */ - if (ptr == 1) { - key.ir_startino = rp->ir_startino; - kp = &key; - } - } - /* - * Decrement and log the number of entries in the block. - */ - numrecs--; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); - /* - * Is this the root level? If so, we're almost done. - */ - if (level == cur->bc_nlevels - 1) { - /* - * If this is the root level, - * and there's only one entry left, - * and it's NOT the leaf level, - * then we can get rid of this level. - */ - if (numrecs == 1 && level > 0) { - agbp = cur->bc_private.i.agbp; - agi = XFS_BUF_TO_AGI(agbp); - /* - * pp is still set to the first pointer in the block. - * Make it the new root of the btree. - */ - bno = be32_to_cpu(agi->agi_root); - agi->agi_root = *pp; - be32_add(&agi->agi_level, -1); - /* - * Free the block. - */ - if ((error = xfs_free_extent(cur->bc_tp, - XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) - return error; - xfs_trans_binval(cur->bc_tp, bp); - xfs_ialloc_log_agi(cur->bc_tp, agbp, - XFS_AGI_ROOT | XFS_AGI_LEVEL); - /* - * Update the cursor so there's one fewer level. - */ - cur->bc_bufs[level] = NULL; - cur->bc_nlevels--; - } else if (level > 0 && - (error = xfs_inobt_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * If we deleted the leftmost entry in the block, update the - * key values above us in the tree. - */ - if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1))) - return error; - /* - * If the number of records remaining in the block is at least - * the minimum, we're done. - */ - if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if (level > 0 && - (error = xfs_inobt_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * Otherwise, we have to move some records around to keep the - * tree balanced. Look at the left and right sibling blocks to - * see if we can re-balance by moving only one record. - */ - rbno = be32_to_cpu(block->bb_rightsib); - lbno = be32_to_cpu(block->bb_leftsib); - bno = NULLAGBLOCK; - ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK); - /* - * Duplicate the cursor so our btree manipulations here won't - * disrupt the next level up. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - /* - * If there's a right sibling, see if it's ok to shift an entry - * out of it. - */ - if (rbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the last entry in the next block. - * Actually any entry but the first would suffice. - */ - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if ((error = xfs_inobt_increment(tcur, level, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - /* - * Grab a pointer to the block. - */ - rbp = tcur->bc_bufs[level]; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(right->bb_leftsib); - /* - * If right block is full enough so that removing one entry - * won't make it too empty, and left-shifting an entry out - * of right to us works, we're done. - */ - if (be16_to_cpu(right->bb_numrecs) - 1 >= - XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_inobt_lshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_INOBT_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level > 0 && - (error = xfs_inobt_decrement(cur, level, - &i))) - return error; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference, and fix up the temp cursor to point - * to our block again (last record). - */ - rrecs = be16_to_cpu(right->bb_numrecs); - if (lbno != NULLAGBLOCK) { - xfs_btree_firstrec(tcur, level); - if ((error = xfs_inobt_decrement(tcur, level, &i))) - goto error0; - } - } - /* - * If there's a left sibling, see if it's ok to shift an entry - * out of it. - */ - if (lbno != NULLAGBLOCK) { - /* - * Move the temp cursor to the first entry in the - * previous block. - */ - xfs_btree_firstrec(tcur, level); - if ((error = xfs_inobt_decrement(tcur, level, &i))) - goto error0; - xfs_btree_firstrec(tcur, level); - /* - * Grab a pointer to the block. - */ - lbp = tcur->bc_bufs[level]; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - goto error0; -#endif - /* - * Grab the current block number, for future use. - */ - bno = be32_to_cpu(left->bb_rightsib); - /* - * If left block is full enough so that removing one entry - * won't make it too empty, and right-shifting an entry out - * of left to us works, we're done. - */ - if (be16_to_cpu(left->bb_numrecs) - 1 >= - XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if ((error = xfs_inobt_rshift(tcur, level, &i))) - goto error0; - if (i) { - ASSERT(be16_to_cpu(block->bb_numrecs) >= - XFS_INOBT_BLOCK_MINRECS(level, cur)); - xfs_btree_del_cursor(tcur, - XFS_BTREE_NOERROR); - if (level == 0) - cur->bc_ptrs[0]++; - *stat = 1; - return 0; - } - } - /* - * Otherwise, grab the number of records in right for - * future reference. - */ - lrecs = be16_to_cpu(left->bb_numrecs); - } - /* - * Delete the temp cursor, we're done with it. - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - /* - * If here, we need to do a join to keep the tree balanced. - */ - ASSERT(bno != NULLAGBLOCK); - /* - * See if we can join with the left neighbor block. - */ - if (lbno != NULLAGBLOCK && - lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - /* - * Set "right" to be the starting block, - * "left" to be the left neighbor. - */ - rbno = bno; - right = block; - rrecs = be16_to_cpu(right->bb_numrecs); - rbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, lbno, 0, &lbp, - XFS_INO_BTREE_REF))) - return error; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); - lrecs = be16_to_cpu(left->bb_numrecs); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - } - /* - * If that won't work, see if we can join with the right neighbor block. - */ - else if (rbno != NULLAGBLOCK && - rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - /* - * Set "left" to be the starting block, - * "right" to be the right neighbor. - */ - lbno = bno; - left = block; - lrecs = be16_to_cpu(left->bb_numrecs); - lbp = bp; - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, rbno, 0, &rbp, - XFS_INO_BTREE_REF))) - return error; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - rrecs = be16_to_cpu(right->bb_numrecs); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - } - /* - * Otherwise, we can't fix the imbalance. - * Just return. This is probably a logic error, but it's not fatal. - */ - else { - if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i))) - return error; - *stat = 1; - return 0; - } - /* - * We're now going to join "left" and "right" by moving all the stuff - * in "right" to "left" and deleting "right". - */ - if (level > 0) { - /* - * It's a non-leaf. Move keys and pointers. - */ - lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur); - lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < rrecs; i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memcpy(lkp, rkp, rrecs * sizeof(*lkp)); - memcpy(lpp, rpp, rrecs * sizeof(*lpp)); - xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); - xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); - } else { - /* - * It's a leaf. Move records. - */ - lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, rrecs * sizeof(*lrp)); - xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); - } - /* - * If we joined with the left neighbor, set the buffer in the - * cursor to the left block, and fix up the index. - */ - if (bp != lbp) { - xfs_btree_setbuf(cur, level, lbp); - cur->bc_ptrs[level] += lrecs; - } - /* - * If we joined with the right neighbor and there's a level above - * us, increment the cursor at that level. - */ - else if (level + 1 < cur->bc_nlevels && - (error = xfs_alloc_increment(cur, level + 1, &i))) - return error; - /* - * Fix up the number of records in the surviving block. - */ - lrecs += rrecs; - left->bb_numrecs = cpu_to_be16(lrecs); - /* - * Fix up the right block pointer in the surviving block, and log it. - */ - left->bb_rightsib = right->bb_rightsib; - xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there is a right sibling now, make it point to the - * remaining block. - */ - if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) { - xfs_inobt_block_t *rrblock; - xfs_buf_t *rrbp; + agi->agi_root = nptr->s; + be32_add_cpu(&agi->agi_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL); +} - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0, - &rrbp, XFS_INO_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(lbno); - xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * Free the deleting block. - */ - if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, - cur->bc_private.i.agno, rbno), 1))) - return error; - xfs_trans_binval(cur->bc_tp, rbp); - /* - * Readjust the ptr at this level if it's not a leaf, since it's - * still pointing at the deletion point, which makes the cursor - * inconsistent. If this makes the ptr 0, the caller fixes it up. - * We can't use decrement because it would change the next level up. - */ - if (level > 0) - cur->bc_ptrs[level]--; - /* - * Return value means the next level up has something to do. - */ - *stat = 2; - return 0; +STATIC void +xfs_finobt_set_root( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *nptr, + int inc) /* level change */ +{ + struct xfs_buf *agbp = cur->bc_private.a.agbp; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); -error0: - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; + agi->agi_free_root = nptr->s; + be32_add_cpu(&agi->agi_free_level, inc); + xfs_ialloc_log_agi(cur->bc_tp, agbp, + XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL); } -/* - * Insert one record/level. Return information to the caller - * allowing the next level up to proceed if necessary. - */ -STATIC int /* error */ -xfs_inobt_insrec( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to insert record at */ - xfs_agblock_t *bnop, /* i/o: block number inserted */ - xfs_inobt_rec_t *recp, /* i/o: record data inserted */ - xfs_btree_cur_t **curp, /* output: new cursor replacing cur */ - int *stat) /* success/failure */ +STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { - xfs_inobt_block_t *block; /* btree block record/key lives in */ - xfs_buf_t *bp; /* buffer for block */ - int error; /* error return value */ - int i; /* loop index */ - xfs_inobt_key_t key; /* key value being inserted */ - xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */ - xfs_agblock_t nbno; /* block number of allocated block */ - xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ - xfs_inobt_key_t nkey; /* new key value, from split */ - xfs_inobt_rec_t nrec; /* new record value, for caller */ - int numrecs; - int optr; /* old ptr value */ - xfs_inobt_ptr_t *pp; /* pointer to btree addresses */ - int ptr; /* index in btree block for this rec */ - xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */ + xfs_alloc_arg_t args; /* block allocation args */ + int error; /* error return value */ + xfs_agblock_t sbno = be32_to_cpu(start->s); - /* - * GCC doesn't understand the (arguably complex) control flow in - * this function and complains about uninitialized structure fields - * without this. - */ - memset(&nrec, 0, sizeof(nrec)); + XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY); - /* - * If we made it to the root level, allocate a new root block - * and we're done. - */ - if (level >= cur->bc_nlevels) { - error = xfs_inobt_newroot(cur, &i); - *bnop = NULLAGBLOCK; - *stat = i; + memset(&args, 0, sizeof(args)); + args.tp = cur->bc_tp; + args.mp = cur->bc_mp; + args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_NEAR_BNO; + + error = xfs_alloc_vextent(&args); + if (error) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR); return error; } - /* - * Make a key out of the record data to be inserted, and save it. - */ - key.ir_startino = recp->ir_startino; /* INT_: direct copy */ - optr = ptr = cur->bc_ptrs[level]; - /* - * If we're off the left edge, return failure. - */ - if (ptr == 0) { + if (args.fsbno == NULLFSBLOCK) { + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 0; return 0; } - /* - * Get pointers to the btree buffer and block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); - numrecs = be16_to_cpu(block->bb_numrecs); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; - /* - * Check that the new entry is being inserted in the right place. - */ - if (ptr <= numrecs) { - if (level == 0) { - rp = XFS_INOBT_REC_ADDR(block, ptr, cur); - xfs_btree_check_rec(cur->bc_btnum, recp, rp); - } else { - kp = XFS_INOBT_KEY_ADDR(block, ptr, cur); - xfs_btree_check_key(cur->bc_btnum, &key, kp); - } - } -#endif - nbno = NULLAGBLOCK; - ncur = (xfs_btree_cur_t *)0; - /* - * If the block is full, we can't insert the new entry until we - * make the block un-full. - */ - if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - /* - * First, try shifting an entry to the right neighbor. - */ - if ((error = xfs_inobt_rshift(cur, level, &i))) - return error; - if (i) { - /* nothing */ - } - /* - * Next, try shifting an entry to the left neighbor. - */ - else { - if ((error = xfs_inobt_lshift(cur, level, &i))) - return error; - if (i) { - optr = ptr = cur->bc_ptrs[level]; - } else { - /* - * Next, try splitting the current block - * in half. If this works we have to - * re-set our variables because - * we could be in a different block now. - */ - if ((error = xfs_inobt_split(cur, level, &nbno, - &nkey, &ncur, &i))) - return error; - if (i) { - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, - block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - nrec.ir_startino = nkey.ir_startino; /* INT_: direct copy */ - } else { - /* - * Otherwise the insert fails. - */ - *stat = 0; - return 0; - } - } - } - } - /* - * At this point we know there's room for our new entry in the block - * we're pointing at. - */ - numrecs = be16_to_cpu(block->bb_numrecs); - if (level > 0) { - /* - * It's a non-leaf entry. Make a hole for the new data - * in the key and ptr regions of the block. - */ - kp = XFS_INOBT_KEY_ADDR(block, 1, cur); - pp = XFS_INOBT_PTR_ADDR(block, 1, cur); -#ifdef DEBUG - for (i = numrecs; i >= ptr; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level))) - return error; - } -#endif - memmove(&kp[ptr], &kp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*kp)); - memmove(&pp[ptr], &pp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*pp)); - /* - * Now stuff the new data in, bump numrecs and log the new data. - */ -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, *bnop, level))) - return error; -#endif - kp[ptr - 1] = key; /* INT_: struct copy */ - pp[ptr - 1] = cpu_to_be32(*bnop); - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_inobt_log_keys(cur, bp, ptr, numrecs); - xfs_inobt_log_ptrs(cur, bp, ptr, numrecs); - } else { - /* - * It's a leaf entry. Make a hole for the new record. - */ - rp = XFS_INOBT_REC_ADDR(block, 1, cur); - memmove(&rp[ptr], &rp[ptr - 1], - (numrecs - ptr + 1) * sizeof(*rp)); - /* - * Now stuff the new record in, bump numrecs - * and log the new data. - */ - rp[ptr - 1] = *recp; /* INT_: struct copy */ - numrecs++; - block->bb_numrecs = cpu_to_be16(numrecs); - xfs_inobt_log_recs(cur, bp, ptr, numrecs); - } - /* - * Log the new number of records in the btree header. - */ - xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); -#ifdef DEBUG - /* - * Check that the key/record is in the right place, now. - */ - if (ptr < numrecs) { - if (level == 0) - xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, - rp + ptr); - else - xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1, - kp + ptr); - } -#endif - /* - * If we inserted at the start of a block, update the parents' keys. - */ - if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1))) - return error; - /* - * Return the new block number, if any. - * If there is one, give back a record value and a cursor too. - */ - *bnop = nbno; - if (nbno != NULLAGBLOCK) { - *recp = nrec; /* INT_: struct copy */ - *curp = ncur; - } + ASSERT(args.len == 1); + XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); + + new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno)); *stat = 1; return 0; } -/* - * Log header fields from a btree block. - */ -STATIC void -xfs_inobt_log_block( - xfs_trans_t *tp, /* transaction pointer */ - xfs_buf_t *bp, /* buffer containing btree block */ - int fields) /* mask of fields: XFS_BB_... */ +STATIC int +xfs_inobt_free_block( + struct xfs_btree_cur *cur, + struct xfs_buf *bp) { - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - static const short offsets[] = { /* table of offsets */ - offsetof(xfs_inobt_block_t, bb_magic), - offsetof(xfs_inobt_block_t, bb_level), - offsetof(xfs_inobt_block_t, bb_numrecs), - offsetof(xfs_inobt_block_t, bb_leftsib), - offsetof(xfs_inobt_block_t, bb_rightsib), - sizeof(xfs_inobt_block_t) - }; + xfs_fsblock_t fsbno; + int error; + + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)); + error = xfs_free_extent(cur->bc_tp, fsbno, 1); + if (error) + return error; - xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last); - xfs_trans_log_buf(tp, bp, first, last); + xfs_trans_binval(cur->bc_tp, bp); + return error; } -/* - * Log keys from a btree block (nonleaf). - */ -STATIC void -xfs_inobt_log_keys( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int kfirst, /* index of first key to log */ - int klast) /* index of last key to log */ +STATIC int +xfs_inobt_get_maxrecs( + struct xfs_btree_cur *cur, + int level) { - xfs_inobt_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - xfs_inobt_key_t *kp; /* key pointer in btree block */ - int last; /* last byte offset logged */ - - block = XFS_BUF_TO_INOBT_BLOCK(bp); - kp = XFS_INOBT_KEY_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + return cur->bc_mp->m_inobt_mxr[level != 0]; } -/* - * Log block pointer fields from a btree block (nonleaf). - */ STATIC void -xfs_inobt_log_ptrs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int pfirst, /* index of first pointer to log */ - int plast) /* index of last pointer to log */ +xfs_inobt_init_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_inobt_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */ - - block = XFS_BUF_TO_INOBT_BLOCK(bp); - pp = XFS_INOBT_PTR_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + key->inobt.ir_startino = rec->inobt.ir_startino; } -/* - * Log records from a btree block (leaf). - */ STATIC void -xfs_inobt_log_recs( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_buf_t *bp, /* buffer containing btree block */ - int rfirst, /* index of first record to log */ - int rlast) /* index of last record to log */ +xfs_inobt_init_rec_from_key( + union xfs_btree_key *key, + union xfs_btree_rec *rec) { - xfs_inobt_block_t *block; /* btree block to log from */ - int first; /* first byte offset logged */ - int last; /* last byte offset logged */ - xfs_inobt_rec_t *rp; /* record pointer for btree block */ - - block = XFS_BUF_TO_INOBT_BLOCK(bp); - rp = XFS_INOBT_REC_ADDR(block, 1, cur); - first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block); - last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block); - xfs_trans_log_buf(cur->bc_tp, bp, first, last); + rec->inobt.ir_startino = key->inobt.ir_startino; } -/* - * Lookup the record. The cursor is made to point to it, based on dir. - * Return 0 if can't find any such record, 1 for success. - */ -STATIC int /* error */ -xfs_inobt_lookup( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_lookup_t dir, /* <=, ==, or >= */ - int *stat) /* success/failure */ +STATIC void +xfs_inobt_init_rec_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_rec *rec) { - xfs_agblock_t agbno; /* a.g. relative btree block number */ - xfs_agnumber_t agno; /* allocation group number */ - xfs_inobt_block_t *block=NULL; /* current btree block */ - __int64_t diff; /* difference for the current key */ - int error; /* error return value */ - int keyno=0; /* current key number */ - int level; /* level in the btree */ - xfs_mount_t *mp; /* file system mount point */ - - /* - * Get the allocation group header, and the root block number. - */ - mp = cur->bc_mp; - { - xfs_agi_t *agi; /* a.g. inode header */ - - agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); - agno = be32_to_cpu(agi->agi_seqno); - agbno = be32_to_cpu(agi->agi_root); - } - /* - * Iterate over each level in the btree, starting at the root. - * For each level above the leaves, find the key we need, based - * on the lookup record, then follow the corresponding block - * pointer down to the next level. - */ - for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) { - xfs_buf_t *bp; /* buffer pointer for btree block */ - xfs_daddr_t d; /* disk address of btree block */ - - /* - * Get the disk address we're looking for. - */ - d = XFS_AGB_TO_DADDR(mp, agno, agbno); - /* - * If the old buffer at this level is for a different block, - * throw it away, otherwise just use it. - */ - bp = cur->bc_bufs[level]; - if (bp && XFS_BUF_ADDR(bp) != d) - bp = (xfs_buf_t *)0; - if (!bp) { - /* - * Need to get a new buffer. Read it, then - * set it in the cursor, releasing the old one. - */ - if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, - agno, agbno, 0, &bp, XFS_INO_BTREE_REF))) - return error; - xfs_btree_setbuf(cur, level, bp); - /* - * Point to the btree block, now that we have the buffer - */ - block = XFS_BUF_TO_INOBT_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, level, - bp))) - return error; - } else - block = XFS_BUF_TO_INOBT_BLOCK(bp); - /* - * If we already had a key match at a higher level, we know - * we need to use the first entry in this block. - */ - if (diff == 0) - keyno = 1; - /* - * Otherwise we need to search this block. Do a binary search. - */ - else { - int high; /* high entry number */ - xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */ - xfs_inobt_rec_t *krbase=NULL;/* base of records in block */ - int low; /* low entry number */ - - /* - * Get a pointer to keys or records. - */ - if (level > 0) - kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur); - else - krbase = XFS_INOBT_REC_ADDR(block, 1, cur); - /* - * Set low and high entry numbers, 1-based. - */ - low = 1; - if (!(high = be16_to_cpu(block->bb_numrecs))) { - /* - * If the block is empty, the tree must - * be an empty leaf. - */ - ASSERT(level == 0 && cur->bc_nlevels == 1); - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; - *stat = 0; - return 0; - } - /* - * Binary search the block. - */ - while (low <= high) { - xfs_agino_t startino; /* key value */ - - /* - * keyno is average of low and high. - */ - keyno = (low + high) >> 1; - /* - * Get startino. - */ - if (level > 0) { - xfs_inobt_key_t *kkp; - - kkp = kkbase + keyno - 1; - startino = INT_GET(kkp->ir_startino, ARCH_CONVERT); - } else { - xfs_inobt_rec_t *krp; - - krp = krbase + keyno - 1; - startino = INT_GET(krp->ir_startino, ARCH_CONVERT); - } - /* - * Compute difference to get next direction. - */ - diff = (__int64_t) - startino - cur->bc_rec.i.ir_startino; - /* - * Less than, move right. - */ - if (diff < 0) - low = keyno + 1; - /* - * Greater than, move left. - */ - else if (diff > 0) - high = keyno - 1; - /* - * Equal, we're done. - */ - else - break; - } - } - /* - * If there are more levels, set up for the next level - * by getting the block number and filling in the cursor. - */ - if (level > 0) { - /* - * If we moved left, need the previous key number, - * unless there isn't one. - */ - if (diff > 0 && --keyno < 1) - keyno = 1; - agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, agbno, level))) - return error; -#endif - cur->bc_ptrs[level] = keyno; - } - } - /* - * Done with the search. - * See if we need to adjust the results. - */ - if (dir != XFS_LOOKUP_LE && diff < 0) { - keyno++; - /* - * If ge search and we went off the end of the block, but it's - * not the last block, we're in the wrong block. - */ - if (dir == XFS_LOOKUP_GE && - keyno > be16_to_cpu(block->bb_numrecs) && - be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) { - int i; - - cur->bc_ptrs[0] = keyno; - if ((error = xfs_inobt_increment(cur, 0, &i))) - return error; - ASSERT(i == 1); - *stat = 1; - return 0; - } - } - else if (dir == XFS_LOOKUP_LE && diff > 0) - keyno--; - cur->bc_ptrs[0] = keyno; - /* - * Return if we succeeded or not. - */ - if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) - *stat = 0; - else - *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0)); - return 0; + rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); + rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } /* - * Move 1 record left from cur/level if possible. - * Update cur to reflect the new path. + * initial value of ptr for lookup */ -STATIC int /* error */ -xfs_inobt_lshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ +STATIC void +xfs_inobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) { - int error; /* error return value */ -#ifdef DEBUG - int i; /* loop index */ -#endif - xfs_inobt_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left neighbor block */ - xfs_inobt_block_t *left; /* left neighbor btree block */ - xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */ - xfs_inobt_ptr_t *lpp; /* address pointer for left block */ - xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */ - int nrec; /* new number of left block entries */ - xfs_buf_t *rbp; /* buffer for right (current) block */ - xfs_inobt_block_t *right; /* right (current) btree block */ - xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */ - xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */ - xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); - /* - * Set up variables for this block as "right". - */ - rbp = cur->bc_bufs[level]; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; -#endif - /* - * If we've got no left sibling then we can't shift an entry left. - */ - if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] <= 1) { - *stat = 0; - return 0; - } - /* - * Set up the left neighbor as "left". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib), - 0, &lbp, XFS_INO_BTREE_REF))) - return error; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - nrec = be16_to_cpu(left->bb_numrecs) + 1; - /* - * If non-leaf, copy a key and a ptr to the left block. - */ - if (level > 0) { - lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - *lkp = *rkp; - xfs_inobt_log_keys(cur, lbp, nrec, nrec); - lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level))) - return error; -#endif - *lpp = *rpp; /* INT_: no-change copy */ - xfs_inobt_log_ptrs(cur, lbp, nrec, nrec); - } - /* - * If leaf, copy a record to the left block. - */ - else { - lrp = XFS_INOBT_REC_ADDR(left, nrec, cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - *lrp = *rrp; - xfs_inobt_log_recs(cur, lbp, nrec, nrec); - } - /* - * Bump and log left's numrecs, decrement and log right's numrecs. - */ - be16_add(&left->bb_numrecs, 1); - xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp); - else - xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp); -#endif - be16_add(&right->bb_numrecs, -1); - xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Slide the contents of right down one entry. - */ - if (level > 0) { -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]), - level))) - return error; - } -#endif - memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - } else { - memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ - rkp = &key; - } - /* - * Update the parent key values of right. - */ - if ((error = xfs_inobt_updkey(cur, rkp, level + 1))) - return error; - /* - * Slide the cursor value left one. - */ - cur->bc_ptrs[level]--; - *stat = 1; - return 0; + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + + ptr->s = agi->agi_root; } -/* - * Allocate a new root block, fill it in. - */ -STATIC int /* error */ -xfs_inobt_newroot( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +STATIC void +xfs_finobt_init_ptr_from_cur( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr) { - xfs_agi_t *agi; /* a.g. inode header */ - xfs_alloc_arg_t args; /* allocation argument structure */ - xfs_inobt_block_t *block; /* one half of the old root block */ - xfs_buf_t *bp; /* buffer containing block */ - int error; /* error return value */ - xfs_inobt_key_t *kp; /* btree key pointer */ - xfs_agblock_t lbno; /* left block number */ - xfs_buf_t *lbp; /* left buffer pointer */ - xfs_inobt_block_t *left; /* left btree block */ - xfs_buf_t *nbp; /* new (root) buffer */ - xfs_inobt_block_t *new; /* new (root) btree block */ - int nptr; /* new value for key index, 1 or 2 */ - xfs_inobt_ptr_t *pp; /* btree address pointer */ - xfs_agblock_t rbno; /* right block number */ - xfs_buf_t *rbp; /* right buffer pointer */ - xfs_inobt_block_t *right; /* right btree block */ - xfs_inobt_rec_t *rp; /* btree record pointer */ + struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp); - ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp)); - - /* - * Get a block & a buffer. - */ - agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp); - args.tp = cur->bc_tp; - args.mp = cur->bc_mp; - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, - be32_to_cpu(agi->agi_root)); - args.mod = args.minleft = args.alignment = args.total = args.wasdel = - args.isfl = args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - if ((error = xfs_alloc_vextent(&args))) - return error; - /* - * None available, we fail. - */ - if (args.fsbno == NULLFSBLOCK) { - *stat = 0; - return 0; - } - ASSERT(args.len == 1); - nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0); - new = XFS_BUF_TO_INOBT_BLOCK(nbp); - /* - * Set the root data in the a.g. inode structure. - */ - agi->agi_root = cpu_to_be32(args.agbno); - be32_add(&agi->agi_level, 1); - xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp, - XFS_AGI_ROOT | XFS_AGI_LEVEL); - /* - * At the previous root level there are now two blocks: the old - * root, and the new block generated when it was split. - * We don't know which one the cursor is pointing at, so we - * set up variables "left" and "right" for each case. - */ - bp = cur->bc_bufs[cur->bc_nlevels - 1]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp))) - return error; -#endif - if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) { - /* - * Our block is left, pick up the right block. - */ - lbp = bp; - lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp)); - left = block; - rbno = be32_to_cpu(left->bb_rightsib); - if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, - rbno, 0, &rbp, XFS_INO_BTREE_REF))) - return error; - bp = rbp; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, - cur->bc_nlevels - 1, rbp))) - return error; - nptr = 1; - } else { - /* - * Our block is right, pick up the left block. - */ - rbp = bp; - rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp)); - right = block; - lbno = be32_to_cpu(right->bb_leftsib); - if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, - lbno, 0, &lbp, XFS_INO_BTREE_REF))) - return error; - bp = lbp; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); - if ((error = xfs_btree_check_sblock(cur, left, - cur->bc_nlevels - 1, lbp))) - return error; - nptr = 2; - } - /* - * Fill in the new block's btree header and log it. - */ - new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - new->bb_level = cpu_to_be16(cur->bc_nlevels); - new->bb_numrecs = cpu_to_be16(2); - new->bb_leftsib = cpu_to_be32(NULLAGBLOCK); - new->bb_rightsib = cpu_to_be32(NULLAGBLOCK); - xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS); - ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK); - /* - * Fill in the key data in the new root. - */ - kp = XFS_INOBT_KEY_ADDR(new, 1, cur); - if (be16_to_cpu(left->bb_level) > 0) { - kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur); /* INT_: struct copy */ - kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur); /* INT_: struct copy */ - } else { - rp = XFS_INOBT_REC_ADDR(left, 1, cur); - INT_COPY(kp[0].ir_startino, rp->ir_startino, ARCH_CONVERT); - rp = XFS_INOBT_REC_ADDR(right, 1, cur); - INT_COPY(kp[1].ir_startino, rp->ir_startino, ARCH_CONVERT); - } - xfs_inobt_log_keys(cur, nbp, 1, 2); - /* - * Fill in the pointer data in the new root. - */ - pp = XFS_INOBT_PTR_ADDR(new, 1, cur); - pp[0] = cpu_to_be32(lbno); - pp[1] = cpu_to_be32(rbno); - xfs_inobt_log_ptrs(cur, nbp, 1, 2); - /* - * Fix up the cursor. - */ - xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); - cur->bc_ptrs[cur->bc_nlevels] = nptr; - cur->bc_nlevels++; - *stat = 1; - return 0; + ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno)); + ptr->s = agi->agi_free_root; } -/* - * Move 1 record right from cur/level if possible. - * Update cur to reflect the new path. - */ -STATIC int /* error */ -xfs_inobt_rshift( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to shift record on */ - int *stat) /* success/failure */ +STATIC __int64_t +xfs_inobt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) { - int error; /* error return value */ - int i; /* loop index */ - xfs_inobt_key_t key; /* key value for leaf level upward */ - xfs_buf_t *lbp; /* buffer for left (current) block */ - xfs_inobt_block_t *left; /* left (current) btree block */ - xfs_inobt_key_t *lkp; /* key pointer for left block */ - xfs_inobt_ptr_t *lpp; /* address pointer for left block */ - xfs_inobt_rec_t *lrp; /* record pointer for left block */ - xfs_buf_t *rbp; /* buffer for right neighbor block */ - xfs_inobt_block_t *right; /* right neighbor btree block */ - xfs_inobt_key_t *rkp; /* key pointer for right block */ - xfs_inobt_ptr_t *rpp; /* address pointer for right block */ - xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */ - xfs_btree_cur_t *tcur; /* temporary cursor */ - - /* - * Set up variables for this block as "left". - */ - lbp = cur->bc_bufs[level]; - left = XFS_BUF_TO_INOBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * If we've got no right sibling then we can't shift an entry right. - */ - if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * If the cursor entry is the one that would be moved, don't - * do it... it's too complicated. - */ - if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) { - *stat = 0; - return 0; - } - /* - * Set up the right neighbor as "right". - */ - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), - 0, &rbp, XFS_INO_BTREE_REF))) - return error; - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) - return error; - /* - * If it's full, it can't take another entry. - */ - if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { - *stat = 0; - return 0; - } - /* - * Make a hole at the start of the right neighbor block, then - * copy the last left block entry to the hole. - */ - if (level > 0) { - lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level))) - return error; - } -#endif - memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); -#ifdef DEBUG - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level))) - return error; -#endif - *rkp = *lkp; /* INT_: no change copy */ - *rpp = *lpp; /* INT_: no change copy */ - xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - } else { - lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - *rrp = *lrp; - xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1); - key.ir_startino = rrp->ir_startino; /* INT_: direct copy */ - rkp = &key; - } - /* - * Decrement and log left's numrecs, bump and log right's numrecs. - */ - be16_add(&left->bb_numrecs, -1); - xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS); - be16_add(&right->bb_numrecs, 1); -#ifdef DEBUG - if (level > 0) - xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1); - else - xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1); -#endif - xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS); - /* - * Using a temporary cursor, update the parent key values of the - * block on the right. - */ - if ((error = xfs_btree_dup_cursor(cur, &tcur))) - return error; - xfs_btree_lastrec(tcur, level); - if ((error = xfs_inobt_increment(tcur, level, &i)) || - (error = xfs_inobt_updkey(tcur, rkp, level + 1))) { - xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); - return error; - } - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - *stat = 1; - return 0; + return (__int64_t)be32_to_cpu(key->inobt.ir_startino) - + cur->bc_rec.i.ir_startino; } -/* - * Split cur/level block in half. - * Return new block number and its first record (to be inserted into parent). - */ -STATIC int /* error */ -xfs_inobt_split( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level to split */ - xfs_agblock_t *bnop, /* output: block number allocated */ - xfs_inobt_key_t *keyp, /* output: first key of new block */ - xfs_btree_cur_t **curp, /* output: new cursor */ - int *stat) /* success/failure */ +static int +xfs_inobt_verify( + struct xfs_buf *bp) { - xfs_alloc_arg_t args; /* allocation argument structure */ - int error; /* error return value */ - int i; /* loop index/record number */ - xfs_agblock_t lbno; /* left (current) block number */ - xfs_buf_t *lbp; /* buffer for left block */ - xfs_inobt_block_t *left; /* left (current) btree block */ - xfs_inobt_key_t *lkp; /* left btree key pointer */ - xfs_inobt_ptr_t *lpp; /* left btree address pointer */ - xfs_inobt_rec_t *lrp; /* left btree record pointer */ - xfs_buf_t *rbp; /* buffer for right block */ - xfs_inobt_block_t *right; /* right (new) btree block */ - xfs_inobt_key_t *rkp; /* right btree key pointer */ - xfs_inobt_ptr_t *rpp; /* right btree address pointer */ - xfs_inobt_rec_t *rrp; /* right btree record pointer */ - - /* - * Set up left block (current one). - */ - lbp = cur->bc_bufs[level]; - args.tp = cur->bc_tp; - args.mp = cur->bc_mp; - lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp)); - /* - * Allocate the new block. - * If we can't do it, we're toast. Give up. - */ - args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno); - args.mod = args.minleft = args.alignment = args.total = args.wasdel = - args.isfl = args.userdata = args.minalignslop = 0; - args.minlen = args.maxlen = args.prod = 1; - args.type = XFS_ALLOCTYPE_NEAR_BNO; - if ((error = xfs_alloc_vextent(&args))) - return error; - if (args.fsbno == NULLFSBLOCK) { - *stat = 0; + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + unsigned int level; + + /* + * During growfs operations, we can't verify the exact owner as the + * perag is not fully initialised and hence not attached to the buffer. + * + * Similarly, during log recovery we will have a perag structure + * attached, but the agi information will not yet have been initialised + * from the on disk AGI. We don't currently use any of this information, + * but beware of the landmine (i.e. need to check pag->pagi_init) if we + * ever do. + */ + switch (block->bb_magic) { + case cpu_to_be32(XFS_IBT_CRC_MAGIC): + case cpu_to_be32(XFS_FIBT_CRC_MAGIC): + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && + be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + /* fall through */ + case cpu_to_be32(XFS_IBT_MAGIC): + case cpu_to_be32(XFS_FIBT_MAGIC): + break; + default: return 0; } - ASSERT(args.len == 1); - rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0); - /* - * Set up the new block as "right". - */ - right = XFS_BUF_TO_INOBT_BLOCK(rbp); - /* - * "Left" is the current (according to the cursor) block. - */ - left = XFS_BUF_TO_INOBT_BLOCK(lbp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) - return error; -#endif - /* - * Fill in the btree header for the new block. - */ - right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); - right->bb_level = left->bb_level; - right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2); - /* - * Make sure that if there's an odd number of entries now, that - * each new block will have the same number of entries. - */ - if ((be16_to_cpu(left->bb_numrecs) & 1) && - cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1) - be16_add(&right->bb_numrecs, 1); - i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1; - /* - * For non-leaf blocks, copy keys and addresses over to the new block. - */ - if (level > 0) { - lkp = XFS_INOBT_KEY_ADDR(left, i, cur); - lpp = XFS_INOBT_PTR_ADDR(left, i, cur); - rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); - rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); -#ifdef DEBUG - for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) { - if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level))) - return error; - } -#endif - memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp)); - memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp)); - xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - *keyp = *rkp; - } - /* - * For leaf blocks, copy records over to the new block. - */ - else { - lrp = XFS_INOBT_REC_ADDR(left, i, cur); - rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp)); - xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs)); - keyp->ir_startino = rrp->ir_startino; /* INT_: direct copy */ - } - /* - * Find the left block number by looking in the buffer. - * Adjust numrecs, sibling pointers. - */ - be16_add(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs))); - right->bb_rightsib = left->bb_rightsib; - left->bb_rightsib = cpu_to_be32(args.agbno); - right->bb_leftsib = cpu_to_be32(lbno); - xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS); - xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); - /* - * If there's a block to the new block's right, make that block - * point back to right instead of to left. - */ - if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) { - xfs_inobt_block_t *rrblock; /* rr btree block */ - xfs_buf_t *rrbp; /* buffer for rrblock */ - if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno, - be32_to_cpu(right->bb_rightsib), 0, &rrbp, - XFS_INO_BTREE_REF))) - return error; - rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); - if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) - return error; - rrblock->bb_leftsib = cpu_to_be32(args.agbno); - xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB); - } - /* - * If the cursor is really in the right block, move it there. - * If it's just pointing past the last entry in left, then we'll - * insert there, so don't change anything in that case. - */ - if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) { - xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs); - } - /* - * If there are more levels, we'll need another cursor which refers - * the right block, no matter where this cursor was. - */ - if (level + 1 < cur->bc_nlevels) { - if ((error = xfs_btree_dup_cursor(cur, curp))) - return error; - (*curp)->bc_ptrs[level + 1]++; - } - *bnop = args.agbno; - *stat = 1; - return 0; + /* numrecs and level verification */ + level = be16_to_cpu(block->bb_level); + if (level >= mp->m_in_maxlevels) + return false; + if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; } -/* - * Update keys at all levels from here to the root along the cursor's path. - */ -STATIC int /* error */ -xfs_inobt_updkey( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_inobt_key_t *keyp, /* new key value to update to */ - int level) /* starting level for update */ +static void +xfs_inobt_read_verify( + struct xfs_buf *bp) { - int ptr; /* index of key in block */ - - /* - * Go up the tree from this level toward the root. - * At each level, update the key value to the value input. - * Stop when we reach a level where the cursor isn't pointing - * at the first entry in the block. - */ - for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) { - xfs_buf_t *bp; /* buffer for block */ - xfs_inobt_block_t *block; /* btree block */ -#ifdef DEBUG - int error; /* error return value */ -#endif - xfs_inobt_key_t *kp; /* ptr to btree block keys */ + if (!xfs_btree_sblock_verify_crc(bp)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_inobt_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - ptr = cur->bc_ptrs[level]; - kp = XFS_INOBT_KEY_ADDR(block, ptr, cur); - *kp = *keyp; - xfs_inobt_log_keys(cur, bp, ptr, ptr); + if (bp->b_error) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_verifier_error(bp); } - return 0; } -/* - * Externally visible routines. - */ - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -int /* error */ -xfs_inobt_decrement( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +static void +xfs_inobt_write_verify( + struct xfs_buf *bp) { - xfs_inobt_block_t *block; /* btree block */ - int error; - int lev; /* btree level */ - - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the left at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); - /* - * Decrement the ptr at this level. If we're still in the block - * then we're done. - */ - if (--cur->bc_ptrs[level] > 0) { - *stat = 1; - return 0; - } - /* - * Get a pointer to the btree block. - */ - block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, - cur->bc_bufs[level]))) - return error; -#endif - /* - * If we just went off the left edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree decrementing pointers. - * Stop when we don't go off the left edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) - break; - /* - * Read-ahead the left block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); + if (!xfs_inobt_verify(bp)) { + trace_xfs_btree_corrupt(bp, _RET_IP_); + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ - xfs_buf_t *bp; /* buffer containing btree block */ + xfs_btree_sblock_calc_crc(bp); - agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, agbno, 0, &bp, - XFS_INO_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_INOBT_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs); - } - *stat = 1; - return 0; } -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -int /* error */ -xfs_inobt_delete( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ +const struct xfs_buf_ops xfs_inobt_buf_ops = { + .verify_read = xfs_inobt_read_verify, + .verify_write = xfs_inobt_write_verify, +}; + +#if defined(DEBUG) || defined(XFS_WARN) +STATIC int +xfs_inobt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) { - int error; - int i; /* result code */ - int level; /* btree level */ - - /* - * Go up the tree, starting at leaf level. - * If 2 is returned then a join was done; go to the next level. - * Otherwise we are done. - */ - for (level = 0, i = 2; i == 2; level++) { - if ((error = xfs_inobt_delrec(cur, level, &i))) - return error; - } - if (i == 0) { - for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { - if ((error = xfs_inobt_decrement(cur, level, &i))) - return error; - break; - } - } - } - *stat = i; - return 0; + return be32_to_cpu(k1->inobt.ir_startino) < + be32_to_cpu(k2->inobt.ir_startino); } - -/* - * Get the data from the pointed-to record. - */ -int /* error */ -xfs_inobt_get_rec( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t *ino, /* output: starting inode of chunk */ - __int32_t *fcnt, /* output: number of free inodes */ - xfs_inofree_t *free, /* output: free inode mask */ - int *stat) /* output: success/failure */ +STATIC int +xfs_inobt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) { - xfs_inobt_block_t *block; /* btree block */ - xfs_buf_t *bp; /* buffer containing btree block */ -#ifdef DEBUG - int error; /* error return value */ + return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= + be32_to_cpu(r2->inobt.ir_startino); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_inobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_inobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, #endif - int ptr; /* record number */ - xfs_inobt_rec_t *rec; /* record data */ - - bp = cur->bc_bufs[0]; - ptr = cur->bc_ptrs[0]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, bp))) - return error; +}; + +static const struct xfs_btree_ops xfs_finobt_ops = { + .rec_len = sizeof(xfs_inobt_rec_t), + .key_len = sizeof(xfs_inobt_key_t), + + .dup_cursor = xfs_inobt_dup_cursor, + .set_root = xfs_finobt_set_root, + .alloc_block = xfs_inobt_alloc_block, + .free_block = xfs_inobt_free_block, + .get_minrecs = xfs_inobt_get_minrecs, + .get_maxrecs = xfs_inobt_get_maxrecs, + .init_key_from_rec = xfs_inobt_init_key_from_rec, + .init_rec_from_key = xfs_inobt_init_rec_from_key, + .init_rec_from_cur = xfs_inobt_init_rec_from_cur, + .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur, + .key_diff = xfs_inobt_key_diff, + .buf_ops = &xfs_inobt_buf_ops, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_inobt_keys_inorder, + .recs_inorder = xfs_inobt_recs_inorder, #endif - /* - * Off the right end or left end, return failure. - */ - if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) { - *stat = 0; - return 0; - } - /* - * Point to the record and extract its data. - */ - rec = XFS_INOBT_REC_ADDR(block, ptr, cur); - *ino = INT_GET(rec->ir_startino, ARCH_CONVERT); - *fcnt = INT_GET(rec->ir_freecount, ARCH_CONVERT); - *free = INT_GET(rec->ir_free, ARCH_CONVERT); - *stat = 1; - return 0; -} +}; /* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. + * Allocate a new inode btree cursor. */ -int /* error */ -xfs_inobt_increment( - xfs_btree_cur_t *cur, /* btree cursor */ - int level, /* level in btree, 0 is leaf */ - int *stat) /* success/failure */ +struct xfs_btree_cur * /* new inode btree cursor */ +xfs_inobt_init_cursor( + struct xfs_mount *mp, /* file system mount point */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_buf *agbp, /* buffer for agi structure */ + xfs_agnumber_t agno, /* allocation group number */ + xfs_btnum_t btnum) /* ialloc or free ino btree */ { - xfs_inobt_block_t *block; /* btree block */ - xfs_buf_t *bp; /* buffer containing btree block */ - int error; /* error return value */ - int lev; /* btree level */ + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + struct xfs_btree_cur *cur; - ASSERT(level < cur->bc_nlevels); - /* - * Read-ahead to the right at this level. - */ - xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA); - /* - * Get a pointer to the btree block. - */ - bp = cur->bc_bufs[level]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, level, bp))) - return error; -#endif - /* - * Increment the ptr at this level. If we're still in the block - * then we're done. - */ - if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) { - *stat = 1; - return 0; - } - /* - * If we just went off the right edge of the tree, return failure. - */ - if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) { - *stat = 0; - return 0; - } - /* - * March up the tree incrementing pointers. - * Stop when we don't go off the right edge of a block. - */ - for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - bp = cur->bc_bufs[lev]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; -#endif - if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs)) - break; - /* - * Read-ahead the right block, we're going to read it - * in the next loop. - */ - xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA); - } - /* - * If we went off the root then we are seriously confused. - */ - ASSERT(lev < cur->bc_nlevels); - /* - * Now walk back down the tree, fixing up the cursor's buffer - * pointers and key numbers. - */ - for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp); - lev > level; ) { - xfs_agblock_t agbno; /* block number of btree block */ + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); - agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur)); - if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, - cur->bc_private.i.agno, agbno, 0, &bp, - XFS_INO_BTREE_REF))) - return error; - lev--; - xfs_btree_setbuf(cur, lev, bp); - block = XFS_BUF_TO_INOBT_BLOCK(bp); - if ((error = xfs_btree_check_sblock(cur, block, lev, bp))) - return error; - cur->bc_ptrs[lev] = 1; + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + if (btnum == XFS_BTNUM_INO) { + cur->bc_nlevels = be32_to_cpu(agi->agi_level); + cur->bc_ops = &xfs_inobt_ops; + } else { + cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); + cur->bc_ops = &xfs_finobt_ops; } - *stat = 1; - return 0; -} - -/* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. - */ -int /* error */ -xfs_inobt_insert( - xfs_btree_cur_t *cur, /* btree cursor */ - int *stat) /* success/failure */ -{ - int error; /* error return value */ - int i; /* result value, 0 for failure */ - int level; /* current level number in btree */ - xfs_agblock_t nbno; /* new block number (split result) */ - xfs_btree_cur_t *ncur; /* new cursor (split result) */ - xfs_inobt_rec_t nrec; /* record being inserted this level */ - xfs_btree_cur_t *pcur; /* previous level's cursor */ - level = 0; - nbno = NULLAGBLOCK; - INT_SET(nrec.ir_startino, ARCH_CONVERT, cur->bc_rec.i.ir_startino); - INT_SET(nrec.ir_freecount, ARCH_CONVERT, cur->bc_rec.i.ir_freecount); - INT_SET(nrec.ir_free, ARCH_CONVERT, cur->bc_rec.i.ir_free); - ncur = (xfs_btree_cur_t *)0; - pcur = cur; - /* - * Loop going up the tree, starting at the leaf level. - * Stop when we don't get a split block, that must mean that - * the insert is finished with this level. - */ - do { - /* - * Insert nrec/nbno into this level of the tree. - * Note if we fail, nbno will be null. - */ - if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur, - &i))) { - if (pcur != cur) - xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); - return error; - } - /* - * See if the cursor we just used is trash. - * Can't trash the caller's cursor, but otherwise we should - * if ncur is a new cursor or we're about to be done. - */ - if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) { - cur->bc_nlevels = pcur->bc_nlevels; - xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); - } - /* - * If we got a new cursor, switch to it. - */ - if (ncur) { - pcur = ncur; - ncur = (xfs_btree_cur_t *)0; - } - } while (nbno != NULLAGBLOCK); - *stat = i; - return 0; -} + cur->bc_blocklog = mp->m_sb.sb_blocklog; -/* - * Lookup the record equal to ino in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_eq( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat); -} + if (xfs_sb_version_hascrc(&mp->m_sb)) + cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; -/* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_ge( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat); -} + cur->bc_private.a.agbp = agbp; + cur->bc_private.a.agno = agno; -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -int /* error */ -xfs_inobt_lookup_le( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free, /* free inode mask */ - int *stat) /* success/failure */ -{ - cur->bc_rec.i.ir_startino = ino; - cur->bc_rec.i.ir_freecount = fcnt; - cur->bc_rec.i.ir_free = free; - return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat); + return cur; } /* - * Update the record referred to by cur, to the value given - * by [ino, fcnt, free]. - * This either works (return 0) or gets an EFSCORRUPTED error. + * Calculate number of records in an inobt btree block. */ -int /* error */ -xfs_inobt_update( - xfs_btree_cur_t *cur, /* btree cursor */ - xfs_agino_t ino, /* starting inode of chunk */ - __int32_t fcnt, /* free inode count */ - xfs_inofree_t free) /* free inode mask */ +int +xfs_inobt_maxrecs( + struct xfs_mount *mp, + int blocklen, + int leaf) { - xfs_inobt_block_t *block; /* btree block to update */ - xfs_buf_t *bp; /* buffer containing btree block */ - int error; /* error return value */ - int ptr; /* current record number (updating) */ - xfs_inobt_rec_t *rp; /* pointer to updated record */ + blocklen -= XFS_INOBT_BLOCK_LEN(mp); - /* - * Pick up the current block. - */ - bp = cur->bc_bufs[0]; - block = XFS_BUF_TO_INOBT_BLOCK(bp); -#ifdef DEBUG - if ((error = xfs_btree_check_sblock(cur, block, 0, bp))) - return error; -#endif - /* - * Get the address of the rec to be updated. - */ - ptr = cur->bc_ptrs[0]; - rp = XFS_INOBT_REC_ADDR(block, ptr, cur); - /* - * Fill in the new contents and log them. - */ - INT_SET(rp->ir_startino, ARCH_CONVERT, ino); - INT_SET(rp->ir_freecount, ARCH_CONVERT, fcnt); - INT_SET(rp->ir_free, ARCH_CONVERT, free); - xfs_inobt_log_recs(cur, bp, ptr, ptr); - /* - * Updating first record in leaf. Pass new key value up to our parent. - */ - if (ptr == 1) { - xfs_inobt_key_t key; /* key containing [ino] */ - - INT_SET(key.ir_startino, ARCH_CONVERT, ino); - if ((error = xfs_inobt_updkey(cur, &key, 1))) - return error; - } - return 0; + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index ae3904cb1ee..d7ebea72c2d 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h @@ -24,154 +24,42 @@ struct xfs_buf; struct xfs_btree_cur; -struct xfs_btree_sblock; struct xfs_mount; /* - * There is a btree for the inode map per allocation group. + * Btree block header size depends on a superblock flag. */ -#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */ - -typedef __uint64_t xfs_inofree_t; -#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t)) -#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3) -#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) - -#define XFS_INOBT_MASKN(i,n) xfs_inobt_maskn(i,n) -static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) -{ - return (((n) >= XFS_INODES_PER_CHUNK ? \ - (xfs_inofree_t)0 : ((xfs_inofree_t)1 << (n))) - 1) << (i); -} - -/* - * Data record structure - */ -typedef struct xfs_inobt_rec -{ - xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ - xfs_inofree_t ir_free; /* free inode mask */ -} xfs_inobt_rec_t; - -/* - * Key structure - */ -typedef struct xfs_inobt_key -{ - xfs_agino_t ir_startino; /* starting inode number */ -} xfs_inobt_key_t; - -/* btree pointer type */ -typedef __be32 xfs_inobt_ptr_t; - -/* btree block header type */ -typedef struct xfs_btree_sblock xfs_inobt_block_t; - -#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)XFS_BUF_PTR(bp)) - -/* - * Bit manipulations for ir_free. - */ -#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) -#define XFS_INOBT_IS_FREE(rp,i) \ - (((rp)->ir_free & XFS_INOBT_MASK(i)) != 0) -#define XFS_INOBT_IS_FREE_DISK(rp,i) \ - ((INT_GET((rp)->ir_free,ARCH_CONVERT) & XFS_INOBT_MASK(i)) != 0) -#define XFS_INOBT_SET_FREE(rp,i) ((rp)->ir_free |= XFS_INOBT_MASK(i)) -#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i)) - -/* - * Real block structures have a size equal to the disk block size. - */ -#define XFS_INOBT_BLOCK_SIZE(lev,cur) (1 << (cur)->bc_blocklog) -#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0]) -#define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0]) -#define XFS_INOBT_IS_LAST_REC(cur) \ - ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs)) - -/* - * Maximum number of inode btree levels. - */ -#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels) - -/* - * block numbers in the AG. - */ -#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1)) -#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1)) +#define XFS_INOBT_BLOCK_LEN(mp) \ + (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* * Record, key, and pointer address macros for btree blocks. - */ -#define XFS_INOBT_REC_ADDR(bb,i,cur) \ - (XFS_BTREE_REC_ADDR(XFS_INOBT_BLOCK_SIZE(0,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(0, cur))) -#define XFS_INOBT_KEY_ADDR(bb,i,cur) \ - (XFS_BTREE_KEY_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) - -#define XFS_INOBT_PTR_ADDR(bb,i,cur) \ - (XFS_BTREE_PTR_ADDR(XFS_INOBT_BLOCK_SIZE(1,cur), xfs_inobt, bb, \ - i, XFS_INOBT_BLOCK_MAXRECS(1, cur))) - -/* - * Decrement cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Delete the record pointed to by cur. - * The cursor refers to the place where the record was (could be inserted) - * when the operation returns. - */ -extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat); - -/* - * Get the data from the pointed-to record. - */ -extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, - __int32_t *fcnt, xfs_inofree_t *free, int *stat); - -/* - * Increment cursor by one record at the level. - * For nonzero levels the leaf-ward information is untouched. - */ -extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat); - -/* - * Insert the current record at the point referenced by cur. - * The cursor may be inconsistent on return if splits have been done. - */ -extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat); - -/* - * Lookup the record equal to ino in the btree given by cur. - */ -extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Lookup the first record greater than or equal to ino - * in the btree given by cur. - */ -extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Lookup the first record less than or equal to ino - * in the btree given by cur. - */ -extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free, int *stat); - -/* - * Update the record referred to by cur, to the value given - * by [ino, fcnt, free]. - * This either works (return 0) or gets an EFSCORRUPTED error. - */ -extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino, - __int32_t fcnt, xfs_inofree_t free); + * + * (note that some of these may appear unused, but they are used in userspace) + */ +#define XFS_INOBT_REC_ADDR(mp, block, index) \ + ((xfs_inobt_rec_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (((index) - 1) * sizeof(xfs_inobt_rec_t)))) + +#define XFS_INOBT_KEY_ADDR(mp, block, index) \ + ((xfs_inobt_key_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + ((index) - 1) * sizeof(xfs_inobt_key_t))) + +#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \ + ((xfs_inobt_ptr_t *) \ + ((char *)(block) + \ + XFS_INOBT_BLOCK_LEN(mp) + \ + (maxrecs) * sizeof(xfs_inobt_key_t) + \ + ((index) - 1) * sizeof(xfs_inobt_ptr_t))) + +extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, + struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, + xfs_btnum_t); +extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c new file mode 100644 index 00000000000..c48df5f25b9 --- /dev/null +++ b/fs/xfs/xfs_icache.c @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_inode_item.h" +#include "xfs_quota.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_util.h" + +#include <linux/kthread.h> +#include <linux/freezer.h> + +STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, + struct xfs_perag *pag, struct xfs_inode *ip); + +/* + * Allocate and initialise an xfs_inode. + */ +struct xfs_inode * +xfs_inode_alloc( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + + /* + * if this didn't occur in transactions, we could use + * KM_MAYFAIL and return NULL here on ENOMEM. Set the + * code up to do this anyway. + */ + ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); + if (!ip) + return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } + + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(!xfs_isiflocked(ip)); + ASSERT(ip->i_ino == 0); + + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + /* initialise the xfs inode */ + ip->i_ino = ino; + ip->i_mount = mp; + memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); + ip->i_afp = NULL; + memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); + ip->i_flags = 0; + ip->i_delayed_blks = 0; + memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); + + return ip; +} + +STATIC void +xfs_inode_free_callback( + struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct xfs_inode *ip = XFS_I(inode); + + kmem_zone_free(xfs_inode_zone, ip); +} + +void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + if (ip->i_itemp) { + ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* + * Because we use RCU freeing we need to ensure the inode always + * appears to be reclaimed with an invalid inode number when in the + * free state. The ip->i_flags_lock provides the barrier against lookup + * races. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + +/* + * Check the validity of the inode we just found it the cache + */ +static int +xfs_iget_cache_hit( + struct xfs_perag *pag, + struct xfs_inode *ip, + xfs_ino_t ino, + int flags, + int lock_flags) __releases(RCU) +{ + struct inode *inode = VFS_I(ip); + struct xfs_mount *mp = ip->i_mount; + int error; + + /* + * check for re-use of an inode within an RCU grace period due to the + * radix tree nodes not being updated yet. We monitor for this by + * setting the inode number to zero before freeing the inode structure. + * If the inode has been reallocated and set up, then the inode number + * will not match, so check for that, too. + */ + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != ino) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + + /* + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. + */ + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { + trace_xfs_iget_skip(ip); + XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; + goto out_error; + } + + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { + trace_xfs_iget_reclaim(ip); + + /* + * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode + * from stomping over us while we recycle the inode. We can't + * clear the radix tree reclaimable tag yet as it requires + * pag_ici_lock to be held exclusive. + */ + ip->i_flags |= XFS_IRECLAIM; + + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + rcu_read_lock(); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + ASSERT(ip->i_flags & XFS_IRECLAIMABLE); + trace_xfs_iget_reclaim_fail(ip); + goto out_error; + } + + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + /* + * Clear the per-lifetime state in the inode as we are now + * effectively a new inode and need to return to the initial + * state before reuse occurs. + */ + ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; + ip->i_flags |= XFS_INEW; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); + inode->i_state = I_NEW; + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + } else { + /* If the VFS inode is being torn down, pause and try again. */ + if (!igrab(inode)) { + trace_xfs_iget_skip(ip); + error = EAGAIN; + goto out_error; + } + + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + trace_xfs_iget_hit(ip); + } + + if (lock_flags != 0) + xfs_ilock(ip, lock_flags); + + xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); + XFS_STATS_INC(xs_ig_found); + + return 0; + +out_error: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + return error; +} + + +static int +xfs_iget_cache_miss( + struct xfs_mount *mp, + struct xfs_perag *pag, + xfs_trans_t *tp, + xfs_ino_t ino, + struct xfs_inode **ipp, + int flags, + int lock_flags) +{ + struct xfs_inode *ip; + int error; + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); + int iflags; + + ip = xfs_inode_alloc(mp, ino); + if (!ip) + return ENOMEM; + + error = xfs_iread(mp, tp, ip, flags); + if (error) + goto out_destroy; + + trace_xfs_iget_miss(ip); + + if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_destroy; + } + + /* + * Preload the radix tree so we can insert safely under the + * write spinlock. Note that we cannot sleep inside the preload + * region. Since we can be called from transaction context, don't + * recurse into the file system. + */ + if (radix_tree_preload(GFP_NOFS)) { + error = EAGAIN; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); + } + + /* + * These values must be set before inserting the inode into the radix + * tree as the moment it is inserted a concurrent lookup (allowed by the + * RCU locking mechanism) can find it and that lookup must see that this + * is an inode currently under construction (i.e. that XFS_INEW is set). + * The ip->i_flags_lock that protects the XFS_INEW flag forms the + * memory barrier that ensures this detection works correctly at lookup + * time. + */ + iflags = XFS_INEW; + if (flags & XFS_IGET_DONTCACHE) + iflags |= XFS_IDONTCACHE; + ip->i_udquot = NULL; + ip->i_gdquot = NULL; + ip->i_pdquot = NULL; + xfs_iflags_set(ip, iflags); + + /* insert the new inode */ + spin_lock(&pag->pag_ici_lock); + error = radix_tree_insert(&pag->pag_ici_root, agino, ip); + if (unlikely(error)) { + WARN_ON(error != -EEXIST); + XFS_STATS_INC(xs_ig_dup); + error = EAGAIN; + goto out_preload_end; + } + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + + *ipp = ip; + return 0; + +out_preload_end: + spin_unlock(&pag->pag_ici_lock); + radix_tree_preload_end(); + if (lock_flags) + xfs_iunlock(ip, lock_flags); +out_destroy: + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); + return error; +} + +/* + * Look up an inode by number in the given file system. + * The inode is looked up in the cache held in each AG. + * If the inode is found in the cache, initialise the vfs inode + * if necessary. + * + * If it is not in core, read it in from the file system's device, + * add it to the cache and initialise the vfs inode. + * + * The inode is locked according to the value of the lock_flags parameter. + * This flag parameter indicates how and if the inode's IO lock and inode lock + * should be taken. + * + * mp -- the mount point structure for the current file system. It points + * to the inode hash table. + * tp -- a pointer to the current transaction if there is one. This is + * simply passed through to the xfs_iread() call. + * ino -- the number of the inode desired. This is the unique identifier + * within the file system for the inode being requested. + * lock_flags -- flags indicating how to lock the inode. See the comment + * for xfs_ilock() for a list of valid values. + */ +int +xfs_iget( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_ino_t ino, + uint flags, + uint lock_flags, + xfs_inode_t **ipp) +{ + xfs_inode_t *ip; + int error; + xfs_perag_t *pag; + xfs_agino_t agino; + + /* + * xfs_reclaim_inode() uses the ILOCK to ensure an inode + * doesn't get freed while it's being referenced during a + * radix tree traversal here. It assumes this function + * aqcuires only the ILOCK (and therefore it has no need to + * involve the IOLOCK in this synchronization). + */ + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ + if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) + return EINVAL; + + /* get the perag structure and ensure that it's inode capable */ + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); + agino = XFS_INO_TO_AGINO(mp, ino); + +again: + error = 0; + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + + if (ip) { + error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); + if (error) + goto out_error_or_again; + } else { + rcu_read_unlock(); + XFS_STATS_INC(xs_ig_missed); + + error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, + flags, lock_flags); + if (error) + goto out_error_or_again; + } + xfs_perag_put(pag); + + *ipp = ip; + + /* + * If we have a real type for an on-disk inode, we can set ops(&unlock) + * now. If it's a new inode being created, xfs_ialloc will handle it. + */ + if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) + xfs_setup_inode(ip); + return 0; + +out_error_or_again: + if (error == EAGAIN) { + delay(1); + goto again; + } + xfs_perag_put(pag); + return error; +} + +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 + +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(rcu_read_lock_held()); + + /* + * check for stale RCU freed inode + * + * If the inode has been reallocated, it doesn't matter if it's not in + * the AG we are walking - we are walking for writeback, so if it + * passes all the "valid inode" checks and is dirty, then we'll write + * it back anyway. If it has been reallocated and still being + * initialised, the XFS_INEW check below will catch it. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino) + goto out_unlock_noent; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock_noent; + spin_unlock(&ip->i_flags_lock); + + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + /* inode is valid */ + return 0; + +out_unlock_noent: + spin_unlock(&ip->i_flags_lock); + return ENOENT; +} + +STATIC int +xfs_inode_ag_walk( + struct xfs_mount *mp, + struct xfs_perag *pag, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args, + int tag) +{ + uint32_t first_index; + int last_error = 0; + int skipped; + int done; + int nr_found; + +restart: + done = 0; + skipped = 0; + first_index = 0; + nr_found = 0; + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int error = 0; + int i; + + rcu_read_lock(); + + if (tag == -1) + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + else + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, tag); + + if (!nr_found) { + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can occur if + * we have inodes in the last block of the AG and we + * are currently pointing to the last inode. + * + * Because we may see inodes that are from the wrong AG + * due to RCU freeing and reallocation, only update the + * index if it lies in this AG. It was a race that lead + * us to see this inode, so another lookup from the + * same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], flags, args); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + /* bail out if the filesystem is corrupted. */ + if (error == EFSCORRUPTED) + break; + + cond_resched(); + + } while (nr_found && !done); + + if (skipped) { + delay(1); + goto restart; + } + return last_error; +} + +/* + * Background scanning to trim post-EOF preallocated space. This is queued + * based on the 'speculative_prealloc_lifetime' tunable (5m by default). + */ +STATIC void +xfs_queue_eofblocks( + struct xfs_mount *mp) +{ + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) + queue_delayed_work(mp->m_eofblocks_workqueue, + &mp->m_eofblocks_work, + msecs_to_jiffies(xfs_eofb_secs * 1000)); + rcu_read_unlock(); +} + +void +xfs_eofblocks_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_eofblocks_work); + xfs_icache_free_eofblocks(mp, NULL); + xfs_queue_eofblocks(mp); +} + +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +int +xfs_inode_ag_iterator_tag( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args, + int tag) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + + ag = 0; + while ((pag = xfs_perag_get_tag(mp, ag, tag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + xfs_perag_put(pag); + if (error) { + last_error = error; + if (error == EFSCORRUPTED) + break; + } + } + return XFS_ERROR(last_error); +} + +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs periodic sync default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_reclaim_work_queue( + struct xfs_mount *mp) +{ + + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); + } + rcu_read_unlock(); +} + +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_reclaim_work_queue(mp); +} + +static void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_reclaim_work_queue(ip->i_mount); + + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + pag->pag_ici_reclaimable++; +} + +/* + * We set the inode flag atomically with the radix tree tag. + * Once we get tag lookups on the radix tree, this inode flag + * can go away. + */ +void +xfs_inode_set_reclaim_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + __xfs_inode_set_reclaim_tag(pag, ip); + __xfs_iflags_set(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +STATIC void +__xfs_inode_clear_reclaim( + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } +} + +STATIC void +__xfs_inode_clear_reclaim_tag( + xfs_mount_t *mp, + xfs_perag_t *pag, + xfs_inode_t *ip) +{ + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_clear_reclaim(pag, ip); +} + +/* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + ASSERT(rcu_read_lock_held()); + + /* quick check for stale RCU freed inode */ + if (!ip->i_ino) + return 1; + + /* + * If we are asked for non-blocking operation, do unlocked checks to + * see if the inode already is being flushed or in reclaim to avoid + * lock traffic. + */ + if ((flags & SYNC_TRYLOCK) && + __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) + return 1; + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + * + * Due to RCU lookup, we may find inodes that have been freed and only + * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that + * aren't candidates for reclaim at all, so we must check the + * XFS_IRECLAIMABLE is set first before proceeding to reclaim. + */ + spin_lock(&ip->i_flags_lock); + if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || + __xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* not a reclaim candidate. */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* + * Inodes in different states need to be treated differently. The following + * table lists the inode states and the reclaim actions necessary: + * + * inode state iflush ret required action + * --------------- ---------- --------------- + * bad - reclaim + * shutdown EIO unpin and reclaim + * clean, unpinned 0 reclaim + * stale, unpinned 0 reclaim + * clean, pinned(*) 0 requeue + * stale, pinned EAGAIN requeue + * dirty, async - requeue + * dirty, sync 0 reclaim + * + * (*) dgc: I don't think the clean, pinned state is possible but it gets + * handled anyway given the order of checks implemented. + * + * Also, because we get the flush lock first, we know that any inode that has + * been flushed delwri has had the flush completed by the time we check that + * the inode is clean. + * + * Note that because the inode is flushed delayed write by AIL pushing, the + * flush lock may already be held here and waiting on it can result in very + * long latencies. Hence for sync reclaims, where we wait on the flush lock, + * the caller should push the AIL first before trying to reclaim inodes to + * minimise the amount of time spent waiting. For background relaim, we only + * bother to reclaim clean inodes anyway. + * + * Hence the order of actions after gaining the locks should be: + * bad => reclaim + * shutdown => unpin and reclaim + * pinned, async => requeue + * pinned, sync => unpin + * stale => reclaim + * clean => reclaim + * dirty, async => requeue + * dirty, sync => flush, wait and reclaim + */ +STATIC int +xfs_reclaim_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int sync_mode) +{ + struct xfs_buf *bp = NULL; + int error; + +restart: + error = 0; + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iflock_nowait(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out; + xfs_iflock(ip); + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + xfs_iunpin_wait(ip); + xfs_iflush_abort(ip, false); + goto reclaim; + } + if (xfs_ipincount(ip)) { + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + xfs_iunpin_wait(ip); + } + if (xfs_iflags_test(ip, XFS_ISTALE)) + goto reclaim; + if (xfs_inode_clean(ip)) + goto reclaim; + + /* + * Never flush out dirty data during non-blocking reclaim, as it would + * just contend with AIL pushing trying to do the same job. + */ + if (!(sync_mode & SYNC_WAIT)) + goto out_ifunlock; + + /* + * Now we have an inode that needs flushing. + * + * Note that xfs_iflush will never block on the inode buffer lock, as + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_imap_to_bp() to get the cluster buffer would + * result in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error here, just unlock the + * inode, back off and try again. Hopefully the next pass through will + * see the stale flag set on the inode. + */ + error = xfs_iflush(ip, &bp); + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } + + if (!error) { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + + xfs_iflock(ip); +reclaim: + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + XFS_STATS_INC(xs_ig_reclaims); + /* + * Remove the inode from the per-AG radix tree. + * + * Because radix_tree_delete won't complain even if the item was never + * added to the tree assert that it's been there before to catch + * problems with the inode life time early on. + */ + spin_lock(&pag->pag_ici_lock); + if (!radix_tree_delete(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + ASSERT(0); + __xfs_inode_clear_reclaim(pag, ip); + spin_unlock(&pag->pag_ici_lock); + + /* + * Here we do an (almost) spurious inode lock in order to coordinate + * with inode cache radix tree lookups. This is because the lookup + * can reference the inodes in the cache without taking references. + * + * We make that OK here by ensuring that we wait until the inode is + * unlocked after the lookup before we go ahead and free it. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_qm_dqdetach(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + xfs_inode_free(ip); + return error; + +out_ifunlock: + xfs_ifunlock(ip); +out: + xfs_iflags_clear(ip, XFS_IRECLAIM); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* + * We could return EAGAIN here to make reclaim rescan the inode tree in + * a short while. However, this just burns CPU time scanning the tree + * waiting for IO to complete and the reclaim work never goes back to + * the idle state. Instead, return 0 to let the next scheduled + * background reclaim attempt to reclaim the inode again. + */ + return 0; +} + +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +STATIC int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + xfs_perag_put(pag); + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + done = 1; + rcu_read_unlock(); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + * + * Because we may see inodes that are from the + * wrong AG due to RCU freeing and + * reallocation, only update the index if it + * lies in this AG. It was a race that lead us + * to see this inode, so another lookup from + * the same index will not find it again. + */ + if (XFS_INO_TO_AGNO(mp, ip->i_ino) != + pag->pag_agno) + continue; + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + rcu_read_unlock(); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + cond_resched(); + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return XFS_ERROR(last_error); +} + +int +xfs_reclaim_inodes( + xfs_mount_t *mp, + int mode) +{ + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); +} + +/* + * Scan a certain number of inodes for reclaim. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. + */ +long +xfs_reclaim_inodes_nr( + struct xfs_mount *mp, + int nr_to_scan) +{ + /* kick background reclaimer and push the AIL */ + xfs_reclaim_work_queue(mp); + xfs_ail_push_all(mp->m_ail); + + return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); +} + +/* + * Return the number of reclaimable inodes in the filesystem for + * the shrinker to determine how much to reclaim. + */ +int +xfs_reclaim_inodes_count( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t ag = 0; + int reclaimable = 0; + + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + return reclaimable; +} + +STATIC int +xfs_inode_match_id( + struct xfs_inode *ip, + struct xfs_eofblocks *eofb) +{ + if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && + !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) + return 0; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && + !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) + return 0; + + if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && + xfs_get_projid(ip) != eofb->eof_prid) + return 0; + + return 1; +} + +STATIC int +xfs_inode_free_eofblocks( + struct xfs_inode *ip, + int flags, + void *args) +{ + int ret; + struct xfs_eofblocks *eofb = args; + + if (!xfs_can_free_eofblocks(ip, false)) { + /* inode could be preallocated or append-only */ + trace_xfs_inode_free_eofblocks_invalid(ip); + xfs_inode_clear_eofblocks_tag(ip); + return 0; + } + + /* + * If the mapping is dirty the operation can block and wait for some + * time. Unless we are waiting, skip it. + */ + if (!(flags & SYNC_WAIT) && + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) + return 0; + + if (eofb) { + if (!xfs_inode_match_id(ip, eofb)) + return 0; + + /* skip the inode if the file size is too small */ + if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && + XFS_ISIZE(ip) < eofb->eof_min_file_size) + return 0; + } + + ret = xfs_free_eofblocks(ip->i_mount, ip, true); + + /* don't revisit the inode if we're not waiting */ + if (ret == EAGAIN && !(flags & SYNC_WAIT)) + ret = 0; + + return ret; +} + +int +xfs_icache_free_eofblocks( + struct xfs_mount *mp, + struct xfs_eofblocks *eofb) +{ + int flags = SYNC_TRYLOCK; + + if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) + flags = SYNC_WAIT; + + return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, + eofb, XFS_ICI_EOFBLOCKS_TAG); +} + +void +xfs_inode_set_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + int tagged; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_set_eofblocks_tag(ip); + + tagged = radix_tree_tagged(&pag->pag_ici_root, + XFS_ICI_EOFBLOCKS_TAG); + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!tagged) { + /* propagate the eofblocks tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + + /* kick off background trimming */ + xfs_queue_eofblocks(ip->i_mount); + + trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +void +xfs_inode_clear_eofblocks_tag( + xfs_inode_t *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + trace_xfs_inode_clear_eofblocks_tag(ip); + + radix_tree_tag_clear(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { + /* clear the eofblocks tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_EOFBLOCKS_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } + + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h new file mode 100644 index 00000000000..9cf017b899b --- /dev/null +++ b/fs/xfs/xfs_icache.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef XFS_SYNC_H +#define XFS_SYNC_H 1 + +struct xfs_mount; +struct xfs_perag; + +struct xfs_eofblocks { + __u32 eof_flags; + kuid_t eof_uid; + kgid_t eof_gid; + prid_t eof_prid; + __u64 eof_min_file_size; +}; + +#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ +#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ + +/* + * Flags for xfs_iget() + */ +#define XFS_IGET_CREATE 0x1 +#define XFS_IGET_UNTRUSTED 0x2 +#define XFS_IGET_DONTCACHE 0x4 + +int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, + uint flags, uint lock_flags, xfs_inode_t **ipp); + +/* recovery needs direct inode allocation capability */ +struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino); +void xfs_inode_free(struct xfs_inode *ip); + +void xfs_reclaim_worker(struct work_struct *work); + +int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); +int xfs_reclaim_inodes_count(struct xfs_mount *mp); +long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); + +void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); + +void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); +void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); +void xfs_eofblocks_worker(struct work_struct *); + +int xfs_inode_ag_iterator(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args); +int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int tag); + +static inline int +xfs_fs_eofblocks_from_user( + struct xfs_fs_eofblocks *src, + struct xfs_eofblocks *dst) +{ + if (src->eof_version != XFS_EOFBLOCKS_VERSION) + return EINVAL; + + if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) + return EINVAL; + + if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || + memchr_inv(src->pad64, 0, sizeof(src->pad64))) + return EINVAL; + + dst->eof_flags = src->eof_flags; + dst->eof_prid = src->eof_prid; + dst->eof_min_file_size = src->eof_min_file_size; + + dst->eof_uid = INVALID_UID; + if (src->eof_flags & XFS_EOF_FLAGS_UID) { + dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); + if (!uid_valid(dst->eof_uid)) + return EINVAL; + } + + dst->eof_gid = INVALID_GID; + if (src->eof_flags & XFS_EOF_FLAGS_GID) { + dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); + if (!gid_valid(dst->eof_gid)) + return EINVAL; + } + return 0; +} + +#endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c new file mode 100644 index 00000000000..7e454923325 --- /dev/null +++ b/fs/xfs/xfs_icreate_item.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2008-2010, 2013 Dave Chinner + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_error.h" +#include "xfs_icreate_item.h" +#include "xfs_log.h" + +kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) +{ + return container_of(lip, struct xfs_icreate_item, ic_item); +} + +/* + * This returns the number of iovecs needed to log the given inode item. + * + * We only need one iovec for the icreate log structure. + */ +STATIC void +xfs_icreate_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) +{ + *nvecs += 1; + *nbytes += sizeof(struct xfs_icreate_log); +} + +/* + * This is called to fill in the vector of log iovecs for the + * given inode create log item. + */ +STATIC void +xfs_icreate_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + struct xfs_log_iovec *vecp = NULL; + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE, + &icp->ic_format, + sizeof(struct xfs_icreate_log)); +} + + +/* Pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_pin( + struct xfs_log_item *lip) +{ +} + + +/* pinning has no meaning for the create item, so just return. */ +STATIC void +xfs_icreate_item_unpin( + struct xfs_log_item *lip, + int remove) +{ +} + +STATIC void +xfs_icreate_item_unlock( + struct xfs_log_item *lip) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + if (icp->ic_item.li_flags & XFS_LI_ABORTED) + kmem_zone_free(xfs_icreate_zone, icp); + return; +} + +/* + * Because we have ordered buffers being tracked in the AIL for the inode + * creation, we don't need the create item after this. Hence we can free + * the log item and return -1 to tell the caller we're done with the item. + */ +STATIC xfs_lsn_t +xfs_icreate_item_committed( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ + struct xfs_icreate_item *icp = ICR_ITEM(lip); + + kmem_zone_free(xfs_icreate_zone, icp); + return (xfs_lsn_t)-1; +} + +/* item can never get into the AIL */ +STATIC uint +xfs_icreate_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + ASSERT(0); + return XFS_ITEM_SUCCESS; +} + +/* Ordered buffers do the dependency tracking here, so this does nothing. */ +STATIC void +xfs_icreate_item_committing( + struct xfs_log_item *lip, + xfs_lsn_t lsn) +{ +} + +/* + * This is the ops vector shared by all buf log items. + */ +static struct xfs_item_ops xfs_icreate_item_ops = { + .iop_size = xfs_icreate_item_size, + .iop_format = xfs_icreate_item_format, + .iop_pin = xfs_icreate_item_pin, + .iop_unpin = xfs_icreate_item_unpin, + .iop_push = xfs_icreate_item_push, + .iop_unlock = xfs_icreate_item_unlock, + .iop_committed = xfs_icreate_item_committed, + .iop_committing = xfs_icreate_item_committing, +}; + + +/* + * Initialize the inode log item for a newly allocated (in-core) inode. + * + * Inode extents can only reside within an AG. Hence specify the starting + * block for the inode chunk by offset within an AG as well as the + * length of the allocated extent. + * + * This joins the item to the transaction and marks it dirty so + * that we don't need a separate call to do this, nor does the + * caller need to know anything about the icreate item. + */ +void +xfs_icreate_log( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + unsigned int count, + unsigned int inode_size, + xfs_agblock_t length, + unsigned int generation) +{ + struct xfs_icreate_item *icp; + + icp = kmem_zone_zalloc(xfs_icreate_zone, KM_SLEEP); + + xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, + &xfs_icreate_item_ops); + + icp->ic_format.icl_type = XFS_LI_ICREATE; + icp->ic_format.icl_size = 1; /* single vector */ + icp->ic_format.icl_ag = cpu_to_be32(agno); + icp->ic_format.icl_agbno = cpu_to_be32(agbno); + icp->ic_format.icl_count = cpu_to_be32(count); + icp->ic_format.icl_isize = cpu_to_be32(inode_size); + icp->ic_format.icl_length = cpu_to_be32(length); + icp->ic_format.icl_gen = cpu_to_be32(generation); + + xfs_trans_add_item(tp, &icp->ic_item); + tp->t_flags |= XFS_TRANS_DIRTY; + icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/xfs_icreate_item.h index 02de6e62ee3..59e89f87c09 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/xfs_icreate_item.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * Copyright (c) 2008-2010, Dave Chinner * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -15,10 +15,20 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef __XFS_IOCTL32_H__ -#define __XFS_IOCTL32_H__ +#ifndef XFS_ICREATE_ITEM_H +#define XFS_ICREATE_ITEM_H 1 -extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long); -extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long); +/* in memory log item structure */ +struct xfs_icreate_item { + struct xfs_log_item ic_item; + struct xfs_icreate_log ic_format; +}; -#endif /* __XFS_IOCTL32_H__ */ +extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ + +void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, + xfs_agblock_t agbno, unsigned int count, + unsigned int inode_size, xfs_agblock_t length, + unsigned int generation); + +#endif /* XFS_ICREATE_ITEM_H */ diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index 0724df7fabb..00000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null @@ -1,1034 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_utils.h" - -/* - * Initialize the inode hash table for the newly mounted file system. - * Choose an initial table size based on user specified value, else - * use a simple algorithm using the maximum number of inodes as an - * indicator for table size, and clamp it between one and some large - * number of pages. - */ -void -xfs_ihash_init(xfs_mount_t *mp) -{ - __uint64_t icount; - uint i, flags = KM_SLEEP | KM_MAYFAIL; - - if (!mp->m_ihsize) { - icount = mp->m_maxicount ? mp->m_maxicount : - (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog); - mp->m_ihsize = 1 << max_t(uint, 8, - (xfs_highbit64(icount) + 1) / 2); - mp->m_ihsize = min_t(uint, mp->m_ihsize, - (64 * NBPP) / sizeof(xfs_ihash_t)); - } - - while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize * - sizeof(xfs_ihash_t), flags))) { - if ((mp->m_ihsize >>= 1) <= NBPP) - flags = KM_SLEEP; - } - for (i = 0; i < mp->m_ihsize; i++) { - rwlock_init(&(mp->m_ihash[i].ih_lock)); - } -} - -/* - * Free up structures allocated by xfs_ihash_init, at unmount time. - */ -void -xfs_ihash_free(xfs_mount_t *mp) -{ - kmem_free(mp->m_ihash, mp->m_ihsize*sizeof(xfs_ihash_t)); - mp->m_ihash = NULL; -} - -/* - * Initialize the inode cluster hash table for the newly mounted file system. - * Its size is derived from the ihash table size. - */ -void -xfs_chash_init(xfs_mount_t *mp) -{ - uint i; - - mp->m_chsize = max_t(uint, 1, mp->m_ihsize / - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)); - mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize); - mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize - * sizeof(xfs_chash_t), - KM_SLEEP); - for (i = 0; i < mp->m_chsize; i++) { - spinlock_init(&mp->m_chash[i].ch_lock,"xfshash"); - } -} - -/* - * Free up structures allocated by xfs_chash_init, at unmount time. - */ -void -xfs_chash_free(xfs_mount_t *mp) -{ - int i; - - for (i = 0; i < mp->m_chsize; i++) { - spinlock_destroy(&mp->m_chash[i].ch_lock); - } - - kmem_free(mp->m_chash, mp->m_chsize*sizeof(xfs_chash_t)); - mp->m_chash = NULL; -} - -/* - * Try to move an inode to the front of its hash list if possible - * (and if its not there already). Called right after obtaining - * the list version number and then dropping the read_lock on the - * hash list in question (which is done right after looking up the - * inode in question...). - */ -STATIC void -xfs_ihash_promote( - xfs_ihash_t *ih, - xfs_inode_t *ip, - ulong version) -{ - xfs_inode_t *iq; - - if ((ip->i_prevp != &ih->ih_next) && write_trylock(&ih->ih_lock)) { - if (likely(version == ih->ih_version)) { - /* remove from list */ - if ((iq = ip->i_next)) { - iq->i_prevp = ip->i_prevp; - } - *ip->i_prevp = iq; - - /* insert at list head */ - iq = ih->ih_next; - iq->i_prevp = &ip->i_next; - ip->i_next = iq; - ip->i_prevp = &ih->ih_next; - ih->ih_next = ip; - } - write_unlock(&ih->ih_lock); - } -} - -/* - * Look up an inode by number in the given file system. - * The inode is looked up in the hash table for the file system - * represented by the mount point parameter mp. Each bucket of - * the hash table is guarded by an individual semaphore. - * - * If the inode is found in the hash table, its corresponding vnode - * is obtained with a call to vn_get(). This call takes care of - * coordination with the reclamation of the inode and vnode. Note - * that the vmap structure is filled in while holding the hash lock. - * This gives us the state of the inode/vnode when we found it and - * is used for coordination in vn_get(). - * - * If it is not in core, read it in from the file system's device and - * add the inode into the hash table. - * - * The inode is locked according to the value of the lock_flags parameter. - * This flag parameter indicates how and if the inode's IO lock and inode lock - * should be taken. - * - * mp -- the mount point structure for the current file system. It points - * to the inode hash table. - * tp -- a pointer to the current transaction if there is one. This is - * simply passed through to the xfs_iread() call. - * ino -- the number of the inode desired. This is the unique identifier - * within the file system for the inode being requested. - * lock_flags -- flags indicating how to lock the inode. See the comment - * for xfs_ilock() for a list of valid values. - * bno -- the block number starting the buffer containing the inode, - * if known (as by bulkstat), else 0. - */ -STATIC int -xfs_iget_core( - bhv_vnode_t *vp, - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp, - xfs_daddr_t bno) -{ - xfs_ihash_t *ih; - xfs_inode_t *ip; - xfs_inode_t *iq; - bhv_vnode_t *inode_vp; - ulong version; - int error; - /* REFERENCED */ - xfs_chash_t *ch; - xfs_chashlist_t *chl, *chlnew; - SPLDECL(s); - - - ih = XFS_IHASH(mp, ino); - -again: - read_lock(&ih->ih_lock); - - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == ino) { - /* - * If INEW is set this inode is being set up - * we need to pause and try again. - */ - if (ip->i_flags & XFS_INEW) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } - - inode_vp = XFS_ITOV_NULL(ip); - if (inode_vp == NULL) { - /* - * If IRECLAIM is set this inode is - * on its way out of the system, - * we need to pause and try again. - */ - if (ip->i_flags & XFS_IRECLAIM) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } - - vn_trace_exit(vp, "xfs_iget.alloc", - (inst_t *)__return_address); - - XFS_STATS_INC(xs_ig_found); - - ip->i_flags &= ~XFS_IRECLAIMABLE; - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - - XFS_MOUNT_ILOCK(mp); - list_del_init(&ip->i_reclaim); - XFS_MOUNT_IUNLOCK(mp); - - goto finish_inode; - - } else if (vp != inode_vp) { - struct inode *inode = vn_to_inode(inode_vp); - - /* The inode is being torn down, pause and - * try again. - */ - if (inode->i_state & (I_FREEING | I_CLEAR)) { - read_unlock(&ih->ih_lock); - delay(1); - XFS_STATS_INC(xs_ig_frecycle); - - goto again; - } -/* Chances are the other vnode (the one in the inode) is being torn - * down right now, and we landed on top of it. Question is, what do - * we do? Unhook the old inode and hook up the new one? - */ - cmn_err(CE_PANIC, - "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p", - inode_vp, vp); - } - - /* - * Inode cache hit: if ip is not at the front of - * its hash chain, move it there now. - * Do this with the lock held for update, but - * do statistics after releasing the lock. - */ - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - XFS_STATS_INC(xs_ig_found); - -finish_inode: - if (ip->i_d.di_mode == 0) { - if (!(flags & IGET_CREATE)) - return ENOENT; - xfs_iocore_inode_reinit(ip); - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - - ip->i_flags &= ~XFS_ISTALE; - - vn_trace_exit(vp, "xfs_iget.found", - (inst_t *)__return_address); - goto return_ip; - } - } - - /* - * Inode cache miss: save the hash chain version stamp and unlock - * the chain, so we don't deadlock in vn_alloc. - */ - XFS_STATS_INC(xs_ig_missed); - - version = ih->ih_version; - - read_unlock(&ih->ih_lock); - - /* - * Read the disk inode attributes into a new inode structure and get - * a new vnode for it. This should also initialize i_ino and i_mount. - */ - error = xfs_iread(mp, tp, ino, &ip, bno); - if (error) { - return error; - } - - vn_trace_exit(vp, "xfs_iget.alloc", (inst_t *)__return_address); - - xfs_inode_lock_init(ip, vp); - xfs_iocore_inode_init(ip); - - if (lock_flags != 0) { - xfs_ilock(ip, lock_flags); - } - - if ((ip->i_d.di_mode == 0) && !(flags & IGET_CREATE)) { - xfs_idestroy(ip); - return ENOENT; - } - - /* - * Put ip on its hash chain, unless someone else hashed a duplicate - * after we released the hash lock. - */ - write_lock(&ih->ih_lock); - - if (ih->ih_version != version) { - for (iq = ih->ih_next; iq != NULL; iq = iq->i_next) { - if (iq->i_ino == ino) { - write_unlock(&ih->ih_lock); - xfs_idestroy(ip); - - XFS_STATS_INC(xs_ig_dup); - goto again; - } - } - } - - /* - * These values _must_ be set before releasing ihlock! - */ - ip->i_hash = ih; - if ((iq = ih->ih_next)) { - iq->i_prevp = &ip->i_next; - } - ip->i_next = iq; - ip->i_prevp = &ih->ih_next; - ih->ih_next = ip; - ip->i_udquot = ip->i_gdquot = NULL; - ih->ih_version++; - ip->i_flags |= XFS_INEW; - - write_unlock(&ih->ih_lock); - - /* - * put ip on its cluster's hash chain - */ - ASSERT(ip->i_chash == NULL && ip->i_cprev == NULL && - ip->i_cnext == NULL); - - chlnew = NULL; - ch = XFS_CHASH(mp, ip->i_blkno); - chlredo: - s = mutex_spinlock(&ch->ch_lock); - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - - /* insert this inode into the doubly-linked list - * where chl points */ - if ((iq = chl->chl_ip)) { - ip->i_cprev = iq->i_cprev; - iq->i_cprev->i_cnext = ip; - iq->i_cprev = ip; - ip->i_cnext = iq; - } else { - ip->i_cnext = ip; - ip->i_cprev = ip; - } - chl->chl_ip = ip; - ip->i_chash = chl; - break; - } - } - - /* no hash list found for this block; add a new hash list */ - if (chl == NULL) { - if (chlnew == NULL) { - mutex_spinunlock(&ch->ch_lock, s); - ASSERT(xfs_chashlist_zone != NULL); - chlnew = (xfs_chashlist_t *) - kmem_zone_alloc(xfs_chashlist_zone, - KM_SLEEP); - ASSERT(chlnew != NULL); - goto chlredo; - } else { - ip->i_cnext = ip; - ip->i_cprev = ip; - ip->i_chash = chlnew; - chlnew->chl_ip = ip; - chlnew->chl_blkno = ip->i_blkno; - if (ch->ch_list) - ch->ch_list->chl_prev = chlnew; - chlnew->chl_next = ch->ch_list; - chlnew->chl_prev = NULL; - ch->ch_list = chlnew; - chlnew = NULL; - } - } else { - if (chlnew != NULL) { - kmem_zone_free(xfs_chashlist_zone, chlnew); - } - } - - mutex_spinunlock(&ch->ch_lock, s); - - - /* - * Link ip to its mount and thread it on the mount's inode list. - */ - XFS_MOUNT_ILOCK(mp); - if ((iq = mp->m_inodes)) { - ASSERT(iq->i_mprev->i_mnext == iq); - ip->i_mprev = iq->i_mprev; - iq->i_mprev->i_mnext = ip; - iq->i_mprev = ip; - ip->i_mnext = iq; - } else { - ip->i_mnext = ip; - ip->i_mprev = ip; - } - mp->m_inodes = ip; - - XFS_MOUNT_IUNLOCK(mp); - - return_ip: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t)); - - ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == - ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); - - *ipp = ip; - - /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) - * now. If it's a new inode being created, xfs_ialloc will handle it. - */ - bhv_vfs_init_vnode(XFS_MTOVFS(mp), vp, XFS_ITOBHV(ip), 1); - - return 0; -} - - -/* - * The 'normal' internal xfs_iget, if needed it will - * 'allocate', or 'get', the vnode. - */ -int -xfs_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp, - xfs_daddr_t bno) -{ - struct inode *inode; - bhv_vnode_t *vp = NULL; - int error; - - XFS_STATS_INC(xs_ig_attempts); - -retry: - if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) { - xfs_inode_t *ip; - - vp = vn_from_inode(inode); - if (inode->i_state & I_NEW) { - vn_initialize(inode); - error = xfs_iget_core(vp, mp, tp, ino, flags, - lock_flags, ipp, bno); - if (error) { - vn_mark_bad(vp); - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - iput(inode); - } - } else { - /* - * If the inode is not fully constructed due to - * filehandle mismatches wait for the inode to go - * away and try again. - * - * iget_locked will call __wait_on_freeing_inode - * to wait for the inode to go away. - */ - if (is_bad_inode(inode) || - ((ip = xfs_vtoi(vp)) == NULL)) { - iput(inode); - delay(1); - goto retry; - } - - if (lock_flags != 0) - xfs_ilock(ip, lock_flags); - XFS_STATS_INC(xs_ig_found); - *ipp = ip; - error = 0; - } - } else - error = ENOMEM; /* If we got no inode we are out of memory */ - - return error; -} - -/* - * Do the setup for the various locks within the incore inode. - */ -void -xfs_inode_lock_init( - xfs_inode_t *ip, - bhv_vnode_t *vp) -{ - mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", (long)vp->v_number); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", vp->v_number); - init_waitqueue_head(&ip->i_ipin_wait); - atomic_set(&ip->i_pincount, 0); - init_sema(&ip->i_flock, 1, "xfsfino", vp->v_number); -} - -/* - * Look for the inode corresponding to the given ino in the hash table. - * If it is there and its i_transp pointer matches tp, return it. - * Otherwise, return NULL. - */ -xfs_inode_t * -xfs_inode_incore(xfs_mount_t *mp, - xfs_ino_t ino, - xfs_trans_t *tp) -{ - xfs_ihash_t *ih; - xfs_inode_t *ip; - ulong version; - - ih = XFS_IHASH(mp, ino); - read_lock(&ih->ih_lock); - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == ino) { - /* - * If we find it and tp matches, return it. - * Also move it to the front of the hash list - * if we find it and it is not already there. - * Otherwise break from the loop and return - * NULL. - */ - if (ip->i_transp == tp) { - version = ih->ih_version; - read_unlock(&ih->ih_lock); - xfs_ihash_promote(ih, ip, version); - return (ip); - } - break; - } - } - read_unlock(&ih->ih_lock); - return (NULL); -} - -/* - * Decrement reference count of an inode structure and unlock it. - * - * ip -- the inode being released - * lock_flags -- this parameter indicates the inode's locks to be - * to be released. See the comment on xfs_iunlock() for a list - * of valid values. - */ -void -xfs_iput(xfs_inode_t *ip, - uint lock_flags) -{ - bhv_vnode_t *vp = XFS_ITOV(ip); - - vn_trace_entry(vp, "xfs_iput", (inst_t *)__return_address); - xfs_iunlock(ip, lock_flags); - VN_RELE(vp); -} - -/* - * Special iput for brand-new inodes that are still locked - */ -void -xfs_iput_new(xfs_inode_t *ip, - uint lock_flags) -{ - bhv_vnode_t *vp = XFS_ITOV(ip); - struct inode *inode = vn_to_inode(vp); - - vn_trace_entry(vp, "xfs_iput_new", (inst_t *)__return_address); - - if ((ip->i_d.di_mode == 0)) { - ASSERT(!(ip->i_flags & XFS_IRECLAIMABLE)); - vn_mark_bad(vp); - } - if (inode->i_state & I_NEW) - unlock_new_inode(inode); - if (lock_flags) - xfs_iunlock(ip, lock_flags); - VN_RELE(vp); -} - - -/* - * This routine embodies the part of the reclaim code that pulls - * the inode from the inode hash table and the mount structure's - * inode list. - * This should only be called from xfs_reclaim(). - */ -void -xfs_ireclaim(xfs_inode_t *ip) -{ - bhv_vnode_t *vp; - - /* - * Remove from old hash list and mount list. - */ - XFS_STATS_INC(xs_ig_reclaims); - - xfs_iextract(ip); - - /* - * Here we do a spurious inode lock in order to coordinate with - * xfs_sync(). This is because xfs_sync() references the inodes - * in the mount list without taking references on the corresponding - * vnodes. We make that OK here by ensuring that we wait until - * the inode is unlocked in xfs_sync() before we go ahead and - * free it. We get both the regular lock and the io lock because - * the xfs_sync() code may need to drop the regular one but will - * still hold the io lock. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - /* - * Release dquots (and their references) if any. An inode may escape - * xfs_inactive and get here via vn_alloc->vn_reclaim path. - */ - XFS_QM_DQDETACH(ip->i_mount, ip); - - /* - * Pull our behavior descriptor from the vnode chain. - */ - vp = XFS_ITOV_NULL(ip); - if (vp) { - vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); - } - - /* - * Free all memory associated with the inode. - */ - xfs_idestroy(ip); -} - -/* - * This routine removes an about-to-be-destroyed inode from - * all of the lists in which it is located with the exception - * of the behavior chain. - */ -void -xfs_iextract( - xfs_inode_t *ip) -{ - xfs_ihash_t *ih; - xfs_inode_t *iq; - xfs_mount_t *mp; - xfs_chash_t *ch; - xfs_chashlist_t *chl, *chm; - SPLDECL(s); - - ih = ip->i_hash; - write_lock(&ih->ih_lock); - if ((iq = ip->i_next)) { - iq->i_prevp = ip->i_prevp; - } - *ip->i_prevp = iq; - ih->ih_version++; - write_unlock(&ih->ih_lock); - - /* - * Remove from cluster hash list - * 1) delete the chashlist if this is the last inode on the chashlist - * 2) unchain from list of inodes - * 3) point chashlist->chl_ip to 'chl_next' if to this inode. - */ - mp = ip->i_mount; - ch = XFS_CHASH(mp, ip->i_blkno); - s = mutex_spinlock(&ch->ch_lock); - - if (ip->i_cnext == ip) { - /* Last inode on chashlist */ - ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); - ASSERT(ip->i_chash != NULL); - chm=NULL; - chl = ip->i_chash; - if (chl->chl_prev) - chl->chl_prev->chl_next = chl->chl_next; - else - ch->ch_list = chl->chl_next; - if (chl->chl_next) - chl->chl_next->chl_prev = chl->chl_prev; - kmem_zone_free(xfs_chashlist_zone, chl); - } else { - /* delete one inode from a non-empty list */ - iq = ip->i_cnext; - iq->i_cprev = ip->i_cprev; - ip->i_cprev->i_cnext = iq; - if (ip->i_chash->chl_ip == ip) { - ip->i_chash->chl_ip = iq; - } - ip->i_chash = __return_address; - ip->i_cprev = __return_address; - ip->i_cnext = __return_address; - } - mutex_spinunlock(&ch->ch_lock, s); - - /* - * Remove from mount's inode list. - */ - XFS_MOUNT_ILOCK(mp); - ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL)); - iq = ip->i_mnext; - iq->i_mprev = ip->i_mprev; - ip->i_mprev->i_mnext = iq; - - /* - * Fix up the head pointer if it points to the inode being deleted. - */ - if (mp->m_inodes == ip) { - if (ip == iq) { - mp->m_inodes = NULL; - } else { - mp->m_inodes = iq; - } - } - - /* Deal with the deleted inodes list */ - list_del_init(&ip->i_reclaim); - - mp->m_ireclaims++; - XFS_MOUNT_IUNLOCK(mp); -} - -/* - * This is a wrapper routine around the xfs_ilock() routine - * used to centralize some grungy code. It is used in places - * that wish to lock the inode solely for reading the extents. - * The reason these places can't just call xfs_ilock(SHARED) - * is that the inode lock also guards to bringing in of the - * extents from disk for a file in b-tree format. If the inode - * is in b-tree format, then we need to lock the inode exclusively - * until the extents are read in. Locking it exclusively all - * the time would limit our parallelism unnecessarily, though. - * What we do instead is check to see if the extents have been - * read in yet, and only lock the inode exclusively if they - * have not. - * - * The function returns a value which should be given to the - * corresponding xfs_iunlock_map_shared(). This value is - * the mode in which the lock was actually taken. - */ -uint -xfs_ilock_map_shared( - xfs_inode_t *ip) -{ - uint lock_mode; - - if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && - ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { - lock_mode = XFS_ILOCK_EXCL; - } else { - lock_mode = XFS_ILOCK_SHARED; - } - - xfs_ilock(ip, lock_mode); - - return lock_mode; -} - -/* - * This is simply the unlock routine to go with xfs_ilock_map_shared(). - * All it does is call xfs_iunlock() with the given lock_mode. - */ -void -xfs_iunlock_map_shared( - xfs_inode_t *ip, - unsigned int lock_mode) -{ - xfs_iunlock(ip, lock_mode); -} - -/* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. - * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL - */ -void -xfs_ilock(xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); - - if (lock_flags & XFS_IOLOCK_EXCL) { - mrupdate(&ip->i_iolock); - } else if (lock_flags & XFS_IOLOCK_SHARED) { - mraccess(&ip->i_iolock); - } - if (lock_flags & XFS_ILOCK_EXCL) { - mrupdate(&ip->i_lock); - } else if (lock_flags & XFS_ILOCK_SHARED) { - mraccess(&ip->i_lock); - } - xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); -} - -/* - * This is just like xfs_ilock(), except that the caller - * is guaranteed not to sleep. It returns 1 if it gets - * the requested locks and 0 otherwise. If the IO lock is - * obtained but the inode lock cannot be, then the IO lock - * is dropped before returning. - * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks to be - * to be locked. See the comment for xfs_ilock() for a list - * of valid values. - * - */ -int -xfs_ilock_nowait(xfs_inode_t *ip, - uint lock_flags) -{ - int iolocked; - int ilocked; - - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~XFS_LOCK_MASK) == 0); - - iolocked = 0; - if (lock_flags & XFS_IOLOCK_EXCL) { - iolocked = mrtryupdate(&ip->i_iolock); - if (!iolocked) { - return 0; - } - } else if (lock_flags & XFS_IOLOCK_SHARED) { - iolocked = mrtryaccess(&ip->i_iolock); - if (!iolocked) { - return 0; - } - } - if (lock_flags & XFS_ILOCK_EXCL) { - ilocked = mrtryupdate(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } - } else if (lock_flags & XFS_ILOCK_SHARED) { - ilocked = mrtryaccess(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } - } - xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); - return 1; -} - -/* - * xfs_iunlock() is used to drop the inode locks acquired with - * xfs_ilock() and xfs_ilock_nowait(). The caller must pass - * in the flags given to xfs_ilock() or xfs_ilock_nowait() so - * that we know which locks to drop. - * - * ip -- the inode being unlocked - * lock_flags -- this parameter indicates the inode's locks to be - * to be unlocked. See the comment for xfs_ilock() for a list - * of valid values for this parameter. - * - */ -void -xfs_iunlock(xfs_inode_t *ip, - uint lock_flags) -{ - /* - * You can't set both SHARED and EXCL for the same lock, - * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, - * and XFS_ILOCK_EXCL are valid values to set in lock_flags. - */ - ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != - (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); - ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != - (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_IUNLOCK_NONOTIFY)) == 0); - ASSERT(lock_flags != 0); - - if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) || - (ismrlocked(&ip->i_iolock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) || - (ismrlocked(&ip->i_iolock, MR_UPDATE))); - mrunlock(&ip->i_iolock); - } - - if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_ILOCK_SHARED) || - (ismrlocked(&ip->i_lock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_ILOCK_EXCL) || - (ismrlocked(&ip->i_lock, MR_UPDATE))); - mrunlock(&ip->i_lock); - - /* - * Let the AIL know that this item has been unlocked in case - * it is in the AIL and anyone is waiting on it. Don't do - * this if the caller has asked us not to. - */ - if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) && - ip->i_itemp != NULL) { - xfs_trans_unlocked_item(ip->i_mount, - (xfs_log_item_t*)(ip->i_itemp)); - } - } - xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); -} - -/* - * give up write locks. the i/o lock cannot be held nested - * if it is being demoted. - */ -void -xfs_ilock_demote(xfs_inode_t *ip, - uint lock_flags) -{ - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - - if (lock_flags & XFS_ILOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - mrdemote(&ip->i_lock); - } - if (lock_flags & XFS_IOLOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); - mrdemote(&ip->i_iolock); - } -} - -/* - * The following three routines simply manage the i_flock - * semaphore embedded in the inode. This semaphore synchronizes - * processes attempting to flush the in-core inode back to disk. - */ -void -xfs_iflock(xfs_inode_t *ip) -{ - psema(&(ip->i_flock), PINOD|PLTWAIT); -} - -int -xfs_iflock_nowait(xfs_inode_t *ip) -{ - return (cpsema(&(ip->i_flock))); -} - -void -xfs_ifunlock(xfs_inode_t *ip) -{ - ASSERT(issemalocked(&(ip->i_flock))); - vsema(&(ip->i_flock)); -} diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h deleted file mode 100644 index d3645000398..00000000000 --- a/fs/xfs/xfs_imap.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2000,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_IMAP_H__ -#define __XFS_IMAP_H__ - -/* - * This is the structure passed to xfs_imap() to map - * an inode number to its on disk location. - */ -typedef struct xfs_imap { - xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - uint im_len; /* length in BBs of inode chunk */ - xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */ - ushort im_ioffset; /* inode offset in block in "inodes" */ - ushort im_boffset; /* inode offset in block in bytes */ -} xfs_imap_t; - -#ifdef __KERNEL__ -struct xfs_mount; -struct xfs_trans; -int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_imap_t *, uint); -#endif - -#endif /* __XFS_IMAP_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f8ecff8553..a6115fe1ac9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -15,773 +15,494 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include <linux/log2.h> + #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_imap.h" -#include "xfs_trans.h" -#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_attr_sf.h" +#include "xfs_attr.h" +#include "xfs_trans_space.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" #include "xfs_ialloc.h" #include "xfs_bmap.h" -#include "xfs_rw.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_utils.h" -#include "xfs_dir2_trace.h" #include "xfs_quota.h" -#include "xfs_mac.h" -#include "xfs_acl.h" - +#include "xfs_filestream.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_bmap_btree.h" -kmem_zone_t *xfs_ifork_zone; kmem_zone_t *xfs_inode_zone; -kmem_zone_t *xfs_chashlist_zone; /* - * Used in xfs_itruncate(). This is the maximum number of extents + * Used in xfs_itruncate_extents(). This is the maximum number of extents * freed from a file in a single transaction. */ #define XFS_ITRUNC_MAX_EXTENTS 2 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *); -STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); -STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); -STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *); -#ifdef DEBUG /* - * Make sure that the extents in the given memory buffer - * are valid. + * helper function to extract extent size hint from inode */ -STATIC void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - int disk, - xfs_exntfmt_t fmt) -{ - xfs_bmbt_rec_t *ep; - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_t rec; - int i; - - for (i = 0; i < nrecs; i++) { - ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned((__uint64_t*)&ep->l0); - rec.l1 = get_unaligned((__uint64_t*)&ep->l1); - if (disk) - xfs_bmbt_disk_get_all(&rec, &irec); - else - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } +xfs_extlen_t +xfs_get_extsz_hint( + struct xfs_inode *ip) +{ + if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize) + return ip->i_d.di_extsize; + if (XFS_IS_REALTIME_INODE(ip)) + return ip->i_mount->m_sb.sb_rextsize; + return 0; } -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, disk, fmt) -#endif /* DEBUG */ /* - * Check that none of the inode's in the buffer have a next - * unlinked field of 0. + * These two are wrapper routines around the xfs_ilock() routine used to + * centralize some grungy code. They are used in places that wish to lock the + * inode solely for reading the extents. The reason these places can't just + * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to + * bringing in of the extents from disk for a file in b-tree format. If the + * inode is in b-tree format, then we need to lock the inode exclusively until + * the extents are read in. Locking it exclusively all the time would limit + * our parallelism unnecessarily, though. What we do instead is check to see + * if the extents have been read in yet, and only lock the inode exclusively + * if they have not. + * + * The functions return a value which should be given to the corresponding + * xfs_iunlock() call. */ -#if defined(DEBUG) -void -xfs_inobp_check( - xfs_mount_t *mp, - xfs_buf_t *bp) +uint +xfs_ilock_data_map_shared( + struct xfs_inode *ip) { - int i; - int j; - xfs_dinode_t *dip; + uint lock_mode = XFS_ILOCK_SHARED; - j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + (ip->i_df.if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; +} - for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); - if (!dip->di_next_unlinked) { - xfs_fs_cmn_err(CE_ALERT, mp, - "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", - bp); - ASSERT(dip->di_next_unlinked); - } - } +uint +xfs_ilock_attr_map_shared( + struct xfs_inode *ip) +{ + uint lock_mode = XFS_ILOCK_SHARED; + + if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE && + (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0) + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_mode); + return lock_mode; } -#endif /* - * This routine is called to map an inode number within a file - * system to the buffer containing the on-disk version of the - * inode. It returns a pointer to the buffer containing the - * on-disk inode in the bpp parameter, and in the dip parameter - * it returns a pointer to the on-disk inode within that buffer. + * The xfs inode contains 2 locks: a multi-reader lock called the + * i_iolock and a multi-reader lock called the i_lock. This routine + * allows either or both of the locks to be obtained. * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. + * The 2 locks should always be ordered so that the IO lock is + * obtained first in order to prevent deadlock. * - * Use xfs_imap() to determine the size and location of the - * buffer to read from disk. + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks + * to be locked. It can be: + * XFS_IOLOCK_SHARED, + * XFS_IOLOCK_EXCL, + * XFS_ILOCK_SHARED, + * XFS_ILOCK_EXCL, + * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, + * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, + * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, + * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL */ -STATIC int -xfs_inotobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - int *offset) +void +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) { - int di_ok; - xfs_imap_t imap; - xfs_buf_t *bp; - int error; - xfs_dinode_t *dip; + trace_xfs_ilock(ip, lock_flags, _RET_IP_); /* - * Call the space management code to find the location of the - * inode on disk. + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. */ - imap.im_blkno = 0; - error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); - if (error != 0) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_imap() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); - return error; - } + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - /* - * If the inode number maps to a block outside the bounds of the - * file system then return NULL rather than calling read_buf - * and panicing when we get an error from the driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { - cmn_err(CE_WARN, - "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds " - "of the file system %s. Returning EINVAL.", - (unsigned long long)imap.im_blkno, - imap.im_len, mp->m_fsname); - return XFS_ERROR(EINVAL); - } + if (lock_flags & XFS_IOLOCK_EXCL) + mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_IOLOCK_SHARED) + mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + + if (lock_flags & XFS_ILOCK_EXCL) + mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); + else if (lock_flags & XFS_ILOCK_SHARED) + mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); +} + +/* + * This is just like xfs_ilock(), except that the caller + * is guaranteed not to sleep. It returns 1 if it gets + * the requested locks and 0 otherwise. If the IO lock is + * obtained but the inode lock cannot be, then the IO lock + * is dropped before returning. + * + * ip -- the inode being locked + * lock_flags -- this parameter indicates the inode's locks to be + * to be locked. See the comment for xfs_ilock() for a list + * of valid values. + */ +int +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) +{ + trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); + + /* + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. + */ + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + + if (lock_flags & XFS_IOLOCK_EXCL) { + if (!mrtryupdate(&ip->i_iolock)) + goto out; + } else if (lock_flags & XFS_IOLOCK_SHARED) { + if (!mrtryaccess(&ip->i_iolock)) + goto out; + } + if (lock_flags & XFS_ILOCK_EXCL) { + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_ILOCK_SHARED) { + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; + } + return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; +} +/* + * xfs_iunlock() is used to drop the inode locks acquired with + * xfs_ilock() and xfs_ilock_nowait(). The caller must pass + * in the flags given to xfs_ilock() or xfs_ilock_nowait() so + * that we know which locks to drop. + * + * ip -- the inode being unlocked + * lock_flags -- this parameter indicates the inode's locks to be + * to be unlocked. See the comment for xfs_ilock() for a list + * of valid values for this parameter. + * + */ +void +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) +{ /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. + * You can't set both SHARED and EXCL for the same lock, + * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, + * and XFS_ILOCK_EXCL are valid values to set in lock_flags. */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); + ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != + (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != + (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); + ASSERT(lock_flags != 0); - if (error) { - cmn_err(CE_WARN, - "xfs_inotobp: xfs_trans_read_buf() returned an " - "error %d on %s. Returning error.", error, mp->m_fsname); - return error; - } - dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0); - di_ok = - INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip); - xfs_trans_brelse(tp, bp); - cmn_err(CE_WARN, - "xfs_inotobp: XFS_TEST_ERROR() returned an " - "error on %s. Returning EFSCORRUPTED.", mp->m_fsname); - return XFS_ERROR(EFSCORRUPTED); - } + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); - xfs_inobp_check(mp, bp); + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); - *bpp = bp; - *offset = imap.im_boffset; - return 0; + trace_xfs_iunlock(ip, lock_flags, _RET_IP_); } - /* - * This routine is called to map an inode to the buffer containing - * the on-disk version of the inode. It returns a pointer to the - * buffer containing the on-disk inode in the bpp parameter, and in - * the dip parameter it returns a pointer to the on-disk inode within - * that buffer. - * - * If a non-zero error is returned, then the contents of bpp and - * dipp are undefined. - * - * If the inode is new and has not yet been initialized, use xfs_imap() - * to determine the size and location of the buffer to read from disk. - * If the inode has already been mapped to its buffer and read in once, - * then use the mapping information stored in the inode rather than - * calling xfs_imap(). This allows us to avoid the overhead of looking - * at the inode btree for small block file systems (see xfs_dilocate()). - * We can tell whether the inode has been mapped in before by comparing - * its disk block address to 0. Only uninitialized inodes will have - * 0 for the disk block address. + * give up write locks. the i/o lock cannot be held nested + * if it is being demoted. */ -int -xfs_itobp( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_dinode_t **dipp, - xfs_buf_t **bpp, - xfs_daddr_t bno, - uint imap_flags) +void +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) { - xfs_imap_t imap; - xfs_buf_t *bp; - int error; - int i; - int ni; + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - if (ip->i_blkno == (xfs_daddr_t)0) { - /* - * Call the space management code to find the location of the - * inode on disk. - */ - imap.im_blkno = bno; - if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, - XFS_IMAP_LOOKUP | imap_flags))) - return error; + if (lock_flags & XFS_ILOCK_EXCL) + mrdemote(&ip->i_lock); + if (lock_flags & XFS_IOLOCK_EXCL) + mrdemote(&ip->i_iolock); - /* - * If the inode number maps to a block outside the bounds - * of the file system then return NULL rather than calling - * read_buf and panicing when we get an error from the - * driver. - */ - if ((imap.im_blkno + imap.im_len) > - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "(imap.im_blkno (0x%llx) " - "+ imap.im_len (0x%llx)) > " - " XFS_FSB_TO_BB(mp, " - "mp->m_sb.sb_dblocks) (0x%llx)", - (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len, - XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); -#endif /* DEBUG */ - return XFS_ERROR(EINVAL); - } + trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); +} - /* - * Fill in the fields in the inode that will be used to - * map the inode to its buffer from now on. - */ - ip->i_blkno = imap.im_blkno; - ip->i_len = imap.im_len; - ip->i_boffset = imap.im_boffset; - } else { - /* - * We've already mapped the inode once, so just use the - * mapping that we saved the first time. - */ - imap.im_blkno = ip->i_blkno; - imap.im_len = ip->i_len; - imap.im_boffset = ip->i_boffset; +#if defined(DEBUG) || defined(XFS_WARN) +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { + if (!(lock_flags & XFS_ILOCK_SHARED)) + return !!ip->i_lock.mr_writer; + return rwsem_is_locked(&ip->i_lock.mr_lock); } - ASSERT(bno == 0 || bno == imap.im_blkno); - /* - * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will - * default to just a read_buf() call. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno, - (int)imap.im_len, XFS_BUF_LOCK, &bp); - if (error) { -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: " - "xfs_trans_read_buf() returned error %d, " - "imap.im_blkno 0x%llx, imap.im_len 0x%llx", - error, (unsigned long long) imap.im_blkno, - (unsigned long long) imap.im_len); -#endif /* DEBUG */ - return error; + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { + if (!(lock_flags & XFS_IOLOCK_SHARED)) + return !!ip->i_iolock.mr_writer; + return rwsem_is_locked(&ip->i_iolock.mr_lock); } - /* - * Validate the magic number and version of every inode in the buffer - * (if DEBUG kernel) or the first inode in the buffer, otherwise. - * No validation is done here in userspace (xfs_repair). - */ -#if !defined(__KERNEL__) - ni = 0; -#elif defined(DEBUG) - ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog; -#else /* usual case */ - ni = 1; + ASSERT(0); + return 0; +} #endif - for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); - di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC && - XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT)); - if (unlikely(XFS_TEST_ERROR(!di_ok, mp, - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (imap_flags & XFS_IMAP_BULKSTAT) { - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EINVAL); - } #ifdef DEBUG - cmn_err(CE_ALERT, - "Device %s - bad inode magic/vsn " - "daddr %lld #%d (magic=%x)", - XFS_BUFTARG_NAME(mp->m_ddev_targp), - (unsigned long long)imap.im_blkno, i, - INT_GET(dip->di_core.di_magic, ARCH_CONVERT)); +int xfs_locked_n; +int xfs_small_retries; +int xfs_middle_retries; +int xfs_lots_retries; +int xfs_lock_delays; #endif - XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH, - mp, dip); - xfs_trans_brelse(tp, bp); - return XFS_ERROR(EFSCORRUPTED); - } - } - - xfs_inobp_check(mp, bp); - /* - * Mark the buffer as an inode buffer now that it looks good - */ - XFS_BUF_SET_VTYPE(bp, B_FS_INO); +/* + * Bump the subclass so xfs_lock_inodes() acquires each lock with + * a different value + */ +static inline int +xfs_lock_inumorder(int lock_mode, int subclass) +{ + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; - /* - * Set *dipp to point to the on-disk inode in the buffer. - */ - *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); - *bpp = bp; - return 0; + return lock_mode; } /* - * Move inode type and inode format specific information from the - * on-disk inode to the in-core inode. For fifos, devs, and sockets - * this means set if_rdev to the proper value. For files, directories, - * and symlinks this means to bring in the in-line data or extent - * pointers. For a file in B-tree format, only the root is immediately - * brought in-core. The rest will be in-lined in if_extents when it - * is first referenced (see xfs_iread_extents()). + * The following routine will lock n inodes in exclusive mode. + * We assume the caller calls us with the inodes in i_ino order. + * + * We need to detect deadlock where an inode that we lock + * is in the AIL and we start waiting for another inode that is locked + * by a thread in a long running transaction (such as truncate). This can + * result in deadlock since the long running trans might need to wait + * for the inode we just locked in order to push the tail and free space + * in the log. */ -STATIC int -xfs_iformat( - xfs_inode_t *ip, - xfs_dinode_t *dip) +void +xfs_lock_inodes( + xfs_inode_t **ips, + int inodes, + uint lock_mode) { - xfs_attr_shortform_t *atp; - int size; - int error; - xfs_fsize_t di_size; - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - error = 0; + int attempts = 0, i, j, try_lock; + xfs_log_item_t *lp; - if (unlikely( - INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) + - INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) > - INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", - (unsigned long long)ip->i_ino, - (int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) - + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)), - (unsigned long long) - INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT)); - XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + ASSERT(ips && (inodes >= 2)); /* we need at least two */ - if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt dinode %Lu, forkoff = 0x%x.", - (unsigned long long)ip->i_ino, - (int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT))); - XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + try_lock = 0; + i = 0; - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFIFO: - case S_IFCHR: - case S_IFBLK: - case S_IFSOCK: - if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) { - XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ip->i_d.di_size = 0; - ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); - break; +again: + for (; i < inodes; i++) { + ASSERT(ips[i]); - case S_IFREG: - case S_IFLNK: - case S_IFDIR: - switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) { - case XFS_DINODE_FMT_LOCAL: - /* - * no local regular files yet - */ - if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu " - "(local format for regular file).", - (unsigned long long) ip->i_ino); - XFS_CORRUPTION_ERROR("xfs_iformat(4)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + if (i && (ips[i] == ips[i-1])) /* Already locked */ + continue; - di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT); - if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu " - "(bad size %Ld for local inode).", - (unsigned long long) ip->i_ino, - (long long) di_size); - XFS_CORRUPTION_ERROR("xfs_iformat(5)", - XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + /* + * If try_lock is not set yet, make sure all locked inodes + * are not in the AIL. + * If any are, set try_lock to be used later. + */ - size = (int)di_size; - error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); - break; - default: - XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + if (!try_lock) { + for (j = (i - 1); j >= 0 && !try_lock; j--) { + lp = (xfs_log_item_t *)ips[j]->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + try_lock++; + } + } } - break; - default: - XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - if (error) { - return error; - } - if (!XFS_DFORK_Q(dip)) - return 0; - ASSERT(ip->i_afp == NULL); - ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); - ip->i_afp->if_ext_max = - XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) { - case XFS_DINODE_FMT_LOCAL: - atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); - size = be16_to_cpu(atp->hdr.totsize); - error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); - break; - case XFS_DINODE_FMT_EXTENTS: - error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); - break; - case XFS_DINODE_FMT_BTREE: - error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); - break; - default: - error = XFS_ERROR(EFSCORRUPTED); - break; - } - if (error) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - xfs_idestroy_fork(ip, XFS_DATA_FORK); - } - return error; -} + /* + * If any of the previous locks we have locked is in the AIL, + * we must TRY to get the second and subsequent locks. If + * we can't get any, we must release all we have + * and try again. + */ -/* - * The file is in-lined in the on-disk inode. - * If it fits into if_inline_data, then copy - * it there, otherwise allocate a buffer for it - * and copy the data there. Either way, set - * if_data to point at the data. - * If we allocate a buffer for the data, make - * sure that its size is a multiple of 4 and - * record the real size in i_real_bytes. - */ -STATIC int -xfs_iformat_local( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork, - int size) -{ - xfs_ifork_t *ifp; - int real_size; + if (try_lock) { + /* try_lock must be 0 if i is 0. */ + /* + * try_lock means we have an inode locked + * that is in the AIL. + */ + ASSERT(i != 0); + if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { + attempts++; - /* - * If the size is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu " - "(bad size %d for local fork, size = %d).", - (unsigned long long) ip->i_ino, size, - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); - XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } - ifp = XFS_IFORK_PTR(ip, whichfork); - real_size = 0; - if (size == 0) - ifp->if_u1.if_data = NULL; - else if (size <= sizeof(ifp->if_u2.if_inline_data)) - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - else { - real_size = roundup(size, 4); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); - } - ifp->if_bytes = size; - ifp->if_real_bytes = real_size; - if (size) - memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFINLINE; - return 0; -} + /* + * Unlock all previous guys and try again. + * xfs_iunlock will try to push the tail + * if the inode is in the AIL. + */ -/* - * The file consists of a set of extents all - * of which fit into the on-disk inode. - * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. - */ -STATIC int -xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) -{ - xfs_bmbt_rec_t *ep, *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; + for(j = i - 1; j >= 0; j--) { - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); + /* + * Check to see if we've already + * unlocked this one. + * Not the first one going back, + * and the inode ptr is the same. + */ + if ((j != (i - 1)) && ips[j] == + ips[j+1]) + continue; - /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. - */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu ((a)extents = %d).", - (unsigned long long) ip->i_ino, nex); - XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); - return XFS_ERROR(EFSCORRUPTED); - } + xfs_iunlock(ips[j], lock_mode); + } - ifp->if_real_bytes = 0; - if (nex == 0) - ifp->if_u1.if_extents = NULL; - else if (nex <= XFS_INLINE_EXTS) - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - else - xfs_iext_add(ifp, 0, nex); - - ifp->if_bytes = size; - if (size) { - dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, 1, XFS_EXTFMT_INODE(ip)); - for (i = 0; i < nex; i++, dp++) { - ep = xfs_iext_get_ext(ifp, i); - ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0), - ARCH_CONVERT); - ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1), - ARCH_CONVERT); - } - xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex, - whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ +#ifdef DEBUG + xfs_lock_delays++; +#endif } + i = 0; + try_lock = 0; + goto again; + } + } else { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + } } - ifp->if_flags |= XFS_IFEXTENTS; - return 0; + +#ifdef DEBUG + if (attempts) { + if (attempts < 5) xfs_small_retries++; + else if (attempts < 100) xfs_middle_retries++; + else xfs_lots_retries++; + } else { + xfs_locked_n++; + } +#endif } /* - * The file has too many extents to fit into - * the inode, so they are in B-tree format. - * Allocate a buffer for the root of the B-tree - * and copy the root into it. The i_extents - * field will remain NULL until all of the - * extents are read in (when they are needed). + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. */ -STATIC int -xfs_iformat_btree( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) +void +xfs_lock_two_inodes( + xfs_inode_t *ip0, + xfs_inode_t *ip1, + uint lock_mode) { - xfs_bmdr_block_t *dfp; - xfs_ifork_t *ifp; - /* REFERENCED */ - int nrecs; - int size; + xfs_inode_t *temp; + int attempts = 0; + xfs_log_item_t *lp; - ifp = XFS_IFORK_PTR(ip, whichfork); - dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); - size = XFS_BMAP_BROOT_SPACE(dfp); - nrecs = XFS_BMAP_BROOT_NUMRECS(dfp); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + ASSERT(ip0->i_ino != ip1->i_ino); - /* - * blow out if -- fork has less extents than can fit in - * fork (fork shouldn't be a btree format), root btree - * block has more records than can fit into the fork, - * or the number of extents is greater than the number of - * blocks. - */ - if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max - || XFS_BMDR_SPACE_CALC(nrecs) > - XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) - || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { - xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, - "corrupt inode %Lu (btree).", - (unsigned long long) ip->i_ino); - XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); + if (ip0->i_ino > ip1->i_ino) { + temp = ip0; + ip0 = ip1; + ip1 = temp; } - ifp->if_broot_bytes = size; - ifp->if_broot = kmem_alloc(size, KM_SLEEP); - ASSERT(ifp->if_broot != NULL); + again: + xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0)); + /* - * Copy and convert from the on-disk structure - * to the in-memory structure. + * If the first lock we have locked is in the AIL, we must TRY to get + * the second lock. If we can't get it, we must release the first one + * and try again. */ - xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), - ifp->if_broot, size); - ifp->if_flags &= ~XFS_IFEXTENTS; - ifp->if_flags |= XFS_IFBROOT; - - return 0; + lp = (xfs_log_item_t *)ip0->i_itemp; + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) { + xfs_iunlock(ip0, lock_mode); + if ((++attempts % 5) == 0) + delay(1); /* Don't just spin the CPU */ + goto again; + } + } else { + xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1)); + } } -/* - * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk - * and native format - * - * buf = on-disk representation - * dip = native representation - * dir = direction - +ve -> disk to native - * -ve -> native to disk - */ + void -xfs_xlate_dinode_core( - xfs_caddr_t buf, - xfs_dinode_core_t *dip, - int dir) +__xfs_iflock( + struct xfs_inode *ip) { - xfs_dinode_core_t *buf_core = (xfs_dinode_core_t *)buf; - xfs_dinode_core_t *mem_core = (xfs_dinode_core_t *)dip; - xfs_arch_t arch = ARCH_CONVERT; - - ASSERT(dir); - - INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch); - INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch); - INT_XLATE(buf_core->di_version, mem_core->di_version, dir, arch); - INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch); - INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch); - INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch); - INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch); - INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch); - INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch); - - if (dir > 0) { - memcpy(mem_core->di_pad, buf_core->di_pad, - sizeof(buf_core->di_pad)); - } else { - memcpy(buf_core->di_pad, mem_core->di_pad, - sizeof(buf_core->di_pad)); - } + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); + + do { + prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_isiflocked(ip)) + io_schedule(); + } while (!xfs_iflock_nowait(ip)); - INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch); - - INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec, - dir, arch); - INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec, - dir, arch); - INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec, - dir, arch); - INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec, - dir, arch); - INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec, - dir, arch); - INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec, - dir, arch); - INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch); - INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch); - INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch); - INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch); - INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch); - INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch); - INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch); - INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch); - INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch); - INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch); - INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch); + finish_wait(wq, &wait.wait); } STATIC uint @@ -817,6 +538,8 @@ _xfs_dic2xflags( flags |= XFS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) flags |= XFS_XFLAG_NODEFRAG; + if (di_flags & XFS_DIFLAG_FILESTREAM) + flags |= XFS_XFLAG_FILESTREAM; } return flags; @@ -826,224 +549,61 @@ uint xfs_ip2xflags( xfs_inode_t *ip) { - xfs_dinode_core_t *dic = &ip->i_d; + xfs_icdinode_t *dic = &ip->i_d; return _xfs_dic2xflags(dic->di_flags) | - (XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0); + (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); } uint xfs_dic2xflags( - xfs_dinode_core_t *dic) + xfs_dinode_t *dip) { - return _xfs_dic2xflags(INT_GET(dic->di_flags, ARCH_CONVERT)) | - (XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | + (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); } /* - * Given a mount structure and an inode number, return a pointer - * to a newly allocated in-core inode corresponding to the given - * inode number. - * - * Initialize the inode's attributes and extent pointers if it - * already has them (it will not if the inode has no links). + * Lookups up an inode from "name". If ci_name is not NULL, then a CI match + * is allowed, otherwise it has to be an exact match. If a CI match is found, + * ci_name->name will point to a the actual name (caller must free) or + * will be set to NULL if an exact match is found. */ int -xfs_iread( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_inode_t **ipp, - xfs_daddr_t bno) +xfs_lookup( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t **ipp, + struct xfs_name *ci_name) { - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_inode_t *ip; - int error; - - ASSERT(xfs_inode_zone != NULL); - - ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP); - ip->i_ino = ino; - ip->i_mount = mp; - - /* - * Get pointer's to the on-disk inode and the buffer containing it. - * If the inode number refers to a block outside the file system - * then xfs_itobp() will return NULL. In this case we should - * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will - * know that this is a new incore inode. - */ - error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, 0); - if (error) { - kmem_zone_free(xfs_inode_zone, ip); - return error; - } - - /* - * Initialize inode's trace buffers. - * Do this before xfs_iformat in case it adds entries. - */ -#ifdef XFS_BMAP_TRACE - ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMBT_TRACE - ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_RW_TRACE - ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_ILOCK_TRACE - ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_DIR2_TRACE - ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP); -#endif + xfs_ino_t inum; + int error; + uint lock_mode; - /* - * If we got something that isn't an inode it means someone - * (nfs or dmi) has a stale handle. - */ - if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) { - kmem_zone_free(xfs_inode_zone, ip); - xfs_trans_brelse(tp, bp); -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " - "dip->di_core.di_magic (0x%x) != " - "XFS_DINODE_MAGIC (0x%x)", - INT_GET(dip->di_core.di_magic, ARCH_CONVERT), - XFS_DINODE_MAGIC); -#endif /* DEBUG */ - return XFS_ERROR(EINVAL); - } + trace_xfs_lookup(dp, name); - /* - * If the on-disk inode is already linked to a directory - * entry, copy all of the inode into the in-core inode. - * xfs_iformat() handles copying in the inode format - * specific information. - * Otherwise, just get the truly permanent information. - */ - if (dip->di_core.di_mode) { - xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, - &(ip->i_d), 1); - error = xfs_iformat(ip, dip); - if (error) { - kmem_zone_free(xfs_inode_zone, ip); - xfs_trans_brelse(tp, bp); -#ifdef DEBUG - xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " - "xfs_iformat() returned error %d", - error); -#endif /* DEBUG */ - return error; - } - } else { - ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT); - ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT); - ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT); - ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT); - /* - * Make sure to pull in the mode here as well in - * case the inode is released without being used. - * This ensures that xfs_inactive() will see that - * the inode is already free and not try to mess - * with the uninitialized part of it. - */ - ip->i_d.di_mode = 0; - /* - * Initialize the per-fork minima and maxima for a new - * inode here. xfs_iformat will do it for old inodes. - */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); - } - - INIT_LIST_HEAD(&ip->i_reclaim); + if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + return XFS_ERROR(EIO); - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. - */ - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { - ip->i_d.di_nlink = ip->i_d.di_onlink; - ip->i_d.di_onlink = 0; - ip->i_d.di_projid = 0; - } + lock_mode = xfs_ilock_data_map_shared(dp); + error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); + xfs_iunlock(dp, lock_mode); - ip->i_delayed_blks = 0; + if (error) + goto out; - /* - * Mark the buffer containing the inode as something to keep - * around for a while. This helps to keep recently accessed - * meta-data in-core longer. - */ - XFS_BUF_SET_REF(bp, XFS_INO_REF); + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); + if (error) + goto out_free_name; - /* - * Use xfs_trans_brelse() to release the buffer containing the - * on-disk inode, because it was acquired with xfs_trans_read_buf() - * in xfs_itobp() above. If tp is NULL, this is just a normal - * brelse(). If we're within a transaction, then xfs_trans_brelse() - * will only release the buffer if it is not dirty within the - * transaction. It will be OK to release the buffer in this case, - * because inodes on disk are never destroyed and we will be - * locking the new in-core inode before putting it in the hash - * table where other processes can find it. Thus we don't have - * to worry about the inode being changed just because we released - * the buffer. - */ - xfs_trans_brelse(tp, bp); - *ipp = ip; return 0; -} - -/* - * Read in extents from a btree-format inode. - * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. - */ -int -xfs_iread_extents( - xfs_trans_t *tp, - xfs_inode_t *ip, - int whichfork) -{ - int error; - xfs_ifork_t *ifp; - xfs_extnum_t nextents; - size_t size; - - if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { - XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, - ip->i_mount); - return XFS_ERROR(EFSCORRUPTED); - } - nextents = XFS_IFORK_NEXTENTS(ip, whichfork); - size = nextents * sizeof(xfs_bmbt_rec_t); - ifp = XFS_IFORK_PTR(ip, whichfork); - /* - * We know that the size is valid (it's checked in iformat_btree) - */ - ifp->if_lastex = NULLEXTNUM; - ifp->if_bytes = ifp->if_real_bytes = 0; - ifp->if_flags |= XFS_IFEXTENTS; - xfs_iext_add(ifp, 0, nextents); - error = xfs_bmap_read_extents(tp, ip, whichfork); - if (error) { - xfs_iext_destroy(ifp); - ifp->if_flags &= ~XFS_IFEXTENTS; - return error; - } - xfs_validate_extents(ifp, nextents, 0, XFS_EXTFMT_INODE(ip)); - return 0; +out_free_name: + if (ci_name) + kmem_free(ci_name->name); +out: + *ipp = NULL; + return error; } /* @@ -1053,16 +613,16 @@ xfs_iread_extents( * set according to the contents of the given cred structure. * * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() - * has a free inode available, call xfs_iget() - * to obtain the in-core version of the allocated inode. Finally, - * fill in the inode and log its initial contents. In this case, - * ialloc_context would be set to NULL and call_again set to false. + * has a free inode available, call xfs_iget() to obtain the in-core + * version of the allocated inode. Finally, fill in the inode and + * log its initial contents. In this case, ialloc_context would be + * set to NULL. * - * If xfs_dialloc() does not have an available inode, - * it will replenish its supply by doing an allocation. Since we can - * only do one allocation within a transaction without deadlocks, we - * must commit the current transaction before returning the inode itself. - * In this case, therefore, we will set call_again to true and return. + * If xfs_dialloc() does not have an available inode, it will replenish + * its supply by doing an allocation. Since we can only do one + * allocation within a transaction without deadlocks, we must commit + * the current transaction before returning the inode itself. + * In this case, therefore, we will set ialloc_context and return. * The caller should then commit the current transaction, start a new * transaction, and call xfs_ialloc() again to actually get the inode. * @@ -1071,37 +631,40 @@ xfs_iread_extents( * also returns the [locked] bp pointing to the head of the freelist * as ialloc_context. The caller should hold this buffer across * the commit and pass it back into this routine on the second call. + * + * If we are allocating quota inodes, we do not have a parent inode + * to attach to or associate with (i.e. pip == NULL) because they + * are not linked into the directory structure - they are attached + * directly to the superblock - and so have no parent. */ int xfs_ialloc( xfs_trans_t *tp, xfs_inode_t *pip, - mode_t mode, + umode_t mode, xfs_nlink_t nlink, xfs_dev_t rdev, - cred_t *cr, - xfs_prid_t prid, + prid_t prid, int okalloc, xfs_buf_t **ialloc_context, - boolean_t *call_again, xfs_inode_t **ipp) { + struct xfs_mount *mp = tp->t_mountp; xfs_ino_t ino; xfs_inode_t *ip; - bhv_vnode_t *vp; uint flags; int error; + timespec_t tv; /* * Call the space management code to pick * the on-disk inode to be allocated. */ - error = xfs_dialloc(tp, pip->i_ino, mode, okalloc, - ialloc_context, call_again, &ino); - if (error != 0) { + error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc, + ialloc_context, &ino); + if (error) return error; - } - if (*call_again || ino == NULLFSINO) { + if (*ialloc_context || ino == NULLFSINO) { *ipp = NULL; return 0; } @@ -1112,47 +675,32 @@ xfs_ialloc( * This is because we're setting fields here we need * to prevent others from looking at until we're done. */ - error = xfs_trans_iget(tp->t_mountp, tp, ino, - IGET_CREATE, XFS_ILOCK_EXCL, &ip); - if (error != 0) { + error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, + XFS_ILOCK_EXCL, &ip); + if (error) return error; - } ASSERT(ip != NULL); - vp = XFS_ITOV(ip); - ip->i_d.di_mode = (__uint16_t)mode; + /* + * We always convert v1 inodes to v2 now - we only support filesystems + * with >= v2 inode capability, so there is no reason for ever leaving + * an inode in v1 format. + */ + if (ip->i_d.di_version == 1) + ip->i_d.di_version = 2; + + ip->i_d.di_mode = mode; ip->i_d.di_onlink = 0; ip->i_d.di_nlink = nlink; ASSERT(ip->i_d.di_nlink == nlink); - ip->i_d.di_uid = current_fsuid(cr); - ip->i_d.di_gid = current_fsgid(cr); - ip->i_d.di_projid = prid; + ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid()); + ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid()); + xfs_set_projid(ip, prid); memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - /* - * If the superblock version is up to where we support new format - * inodes and this is currently an old format inode, then change - * the inode version number now. This way we only do the conversion - * here rather than here and in the flush/logging code. - */ - if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) && - ip->i_d.di_version == XFS_DINODE_VERSION_1) { - ip->i_d.di_version = XFS_DINODE_VERSION_2; - /* - * We've already zeroed the old link count, the projid field, - * and the pad field. - */ - } - - /* - * Project ids won't be stored on disk if we are using a version 1 inode. - */ - if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1)) - xfs_bump_ino_vers2(tp, ip); - - if (XFS_INHERIT_GID(pip, vp->v_vfsp)) { + if (pip && XFS_INHERIT_GID(pip)) { ip->i_d.di_gid = pip->i_d.di_gid; - if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) { + if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) { ip->i_d.di_mode |= S_ISGID; } } @@ -1164,14 +712,20 @@ xfs_ialloc( */ if ((irix_sgid_inherit) && (ip->i_d.di_mode & S_ISGID) && - (!in_group_p((gid_t)ip->i_d.di_gid))) { + (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) { ip->i_d.di_mode &= ~S_ISGID; } ip->i_d.di_size = 0; ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD); + + nanotime(&tv); + ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; + ip->i_d.di_atime = ip->i_d.di_mtime; + ip->i_d.di_ctime = ip->i_d.di_mtime; + /* * di_gen will have been taken care of in xfs_iread. */ @@ -1179,6 +733,19 @@ xfs_ialloc( ip->i_d.di_dmevmask = 0; ip->i_d.di_dmstate = 0; ip->i_d.di_flags = 0; + + if (ip->i_d.di_version == 3) { + ASSERT(ip->i_d.di_ino == ino); + ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid)); + ip->i_d.di_crc = 0; + ip->i_d.di_changecount = 1; + ip->i_d.di_lsn = 0; + ip->i_d.di_flags2 = 0; + memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2)); + ip->i_d.di_crtime = ip->i_d.di_mtime; + } + + flags = XFS_ILOG_CORE; switch (mode & S_IFMT) { case S_IFIFO: @@ -1192,21 +759,19 @@ xfs_ialloc( break; case S_IFREG: case S_IFDIR: - if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) { + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { uint di_flags = 0; - if ((mode & S_IFMT) == S_IFDIR) { + if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } - } else if ((mode & S_IFMT) == S_IFREG) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) { + } else if (S_ISREG(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; - ip->i_iocore.io_flags |= XFS_IOCORE_RT; - } if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; ip->i_d.di_extsize = pip->i_d.di_extsize; @@ -1229,6 +794,8 @@ xfs_ialloc( if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; ip->i_d.di_flags |= di_flags; } /* FALLTHROUGH */ @@ -1250,397 +817,708 @@ xfs_ialloc( /* * Log the new values stuffed into the inode. */ + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); /* now that we have an i_mode we can setup inode ops and unlock */ - bhv_vfs_init_vnode(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1); + xfs_setup_inode(ip); *ipp = ip; return 0; } /* - * Check to make sure that there are no blocks allocated to the - * file beyond the size of the file. We don't check this for - * files with fixed size extents or real time extents, but we - * at least do it for regular files. + * Allocates a new inode from disk and return a pointer to the + * incore copy. This routine will internally commit the current + * transaction and allocate a new one if the Space Manager needed + * to do an allocation to replenish the inode free-list. + * + * This routine is designed to be called from xfs_create and + * xfs_create_dir. + * */ -#ifdef DEBUG -void -xfs_isize_check( - xfs_mount_t *mp, - xfs_inode_t *ip, - xfs_fsize_t isize) -{ - xfs_fileoff_t map_first; - int nimaps; - xfs_bmbt_irec_t imaps[2]; +int +xfs_dir_ialloc( + xfs_trans_t **tpp, /* input: current transaction; + output: may be a new transaction. */ + xfs_inode_t *dp, /* directory within whose allocate + the inode. */ + umode_t mode, + xfs_nlink_t nlink, + xfs_dev_t rdev, + prid_t prid, /* project id */ + int okalloc, /* ok to allocate new space */ + xfs_inode_t **ipp, /* pointer to inode; it will be + locked. */ + int *committed) - if ((ip->i_d.di_mode & S_IFMT) != S_IFREG) - return; +{ + xfs_trans_t *tp; + xfs_trans_t *ntp; + xfs_inode_t *ip; + xfs_buf_t *ialloc_context = NULL; + int code; + void *dqinfo; + uint tflags; + + tp = *tpp; + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + + /* + * xfs_ialloc will return a pointer to an incore inode if + * the Space Manager has an available inode on the free + * list. Otherwise, it will do an allocation and replenish + * the freelist. Since we can only do one allocation per + * transaction without deadlocks, we will need to commit the + * current transaction and start a new one. We will then + * need to call xfs_ialloc again to get the inode. + * + * If xfs_ialloc did an allocation to replenish the freelist, + * it returns the bp containing the head of the freelist as + * ialloc_context. We will hold a lock on it across the + * transaction commit so that no other process can steal + * the inode(s) that we've just allocated. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc, + &ialloc_context, &ip); - if (ip->i_d.di_flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE)) - return; + /* + * Return an error if we were unable to allocate a new inode. + * This should only happen if we run out of space on disk or + * encounter a disk error. + */ + if (code) { + *ipp = NULL; + return code; + } + if (!ialloc_context && !ip) { + *ipp = NULL; + return XFS_ERROR(ENOSPC); + } - nimaps = 2; - map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); /* - * The filesystem could be shutting down, so bmapi may return - * an error. + * If the AGI buffer is non-NULL, then we were unable to get an + * inode in one operation. We need to commit the current + * transaction and call xfs_ialloc() again. It is guaranteed + * to succeed the second time. */ - if (xfs_bmapi(NULL, ip, map_first, - (XFS_B_TO_FSB(mp, - (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - - map_first), - XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, - NULL, NULL)) - return; - ASSERT(nimaps == 1); - ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK); + if (ialloc_context) { + struct xfs_trans_res tres; + + /* + * Normally, xfs_trans_commit releases all the locks. + * We call bhold to hang on to the ialloc_context across + * the commit. Holding this buffer prevents any other + * processes from doing any allocations in this + * allocation group. + */ + xfs_trans_bhold(tp, ialloc_context); + /* + * Save the log reservation so we can use + * them in the next transaction. + */ + tres.tr_logres = xfs_trans_get_log_res(tp); + tres.tr_logcount = xfs_trans_get_log_count(tp); + + /* + * We want the quota changes to be associated with the next + * transaction, NOT this one. So, detach the dqinfo from this + * and attach it to the next transaction. + */ + dqinfo = NULL; + tflags = 0; + if (tp->t_dqinfo) { + dqinfo = (void *)tp->t_dqinfo; + tp->t_dqinfo = NULL; + tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; + tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); + } + + ntp = xfs_trans_dup(tp); + code = xfs_trans_commit(tp, 0); + tp = ntp; + if (committed != NULL) { + *committed = 1; + } + /* + * If we get an error during the commit processing, + * release the buffer that is still held and return + * to the caller. + */ + if (code) { + xfs_buf_relse(ialloc_context); + if (dqinfo) { + tp->t_dqinfo = dqinfo; + xfs_trans_free_dqinfo(tp); + } + *tpp = ntp; + *ipp = NULL; + return code; + } + + /* + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() + */ + xfs_log_ticket_put(tp->t_ticket); + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + code = xfs_trans_reserve(tp, &tres, 0, 0); + + /* + * Re-attach the quota info that we detached from prev trx. + */ + if (dqinfo) { + tp->t_dqinfo = dqinfo; + tp->t_flags |= tflags; + } + + if (code) { + xfs_buf_relse(ialloc_context); + *tpp = ntp; + *ipp = NULL; + return code; + } + xfs_trans_bjoin(tp, ialloc_context); + + /* + * Call ialloc again. Since we've locked out all + * other allocations in this allocation group, + * this call should always succeed. + */ + code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, + okalloc, &ialloc_context, &ip); + + /* + * If we get an error at this point, return to the caller + * so that the current transaction can be aborted. + */ + if (code) { + *tpp = tp; + *ipp = NULL; + return code; + } + ASSERT(!ialloc_context && ip); + + } else { + if (committed != NULL) + *committed = 0; + } + + *ipp = ip; + *tpp = tp; + + return 0; } -#endif /* DEBUG */ /* - * Calculate the last possible buffered byte in a file. This must - * include data that was buffered beyond the EOF by the write code. - * This also needs to deal with overflowing the xfs_fsize_t type - * which can happen for sizes near the limit. - * - * We also need to take into account any blocks beyond the EOF. It - * may be the case that they were buffered by a write which failed. - * In that case the pages will still be in memory, but the inode size - * will never have been updated. + * Decrement the link count on an inode & log the change. + * If this causes the link count to go to zero, initiate the + * logging activity required to truncate a file. */ -xfs_fsize_t -xfs_file_last_byte( - xfs_inode_t *ip) +int /* error */ +xfs_droplink( + xfs_trans_t *tp, + xfs_inode_t *ip) { - xfs_mount_t *mp; - xfs_fsize_t last_byte; - xfs_fileoff_t last_block; - xfs_fileoff_t size_last_block; - int error; + int error; - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT (ip->i_d.di_nlink > 0); + ip->i_d.di_nlink--; + drop_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = 0; + if (ip->i_d.di_nlink == 0) { + /* + * We're dropping the last link to this file. + * Move the on-disk inode to the AGI unlinked list. + * From xfs_inactive() we will pull the inode from + * the list and free it. + */ + error = xfs_iunlink(tp, ip); + } + return error; +} + +/* + * Increment the link count on an inode & log the change. + */ +int +xfs_bumplink( + xfs_trans_t *tp, + xfs_inode_t *ip) +{ + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + + ASSERT(ip->i_d.di_version > 1); + ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE)); + ip->i_d.di_nlink++; + inc_nlink(VFS_I(ip)); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return 0; +} + +int +xfs_create( + xfs_inode_t *dp, + struct xfs_name *name, + umode_t mode, + xfs_dev_t rdev, + xfs_inode_t **ipp) +{ + int is_dir = S_ISDIR(mode); + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res tres; + uint resblks; + + trace_xfs_create(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + prid = xfs_get_initial_prid(dp); - mp = ip->i_mount; /* - * Only check for blocks beyond the EOF if the extents have - * been read in. This eliminates the need for the inode lock, - * and it also saves us from looking when it really isn't - * necessary. + * Make sure that we have allocated dquot(s) on disk. */ - if (ip->i_df.if_flags & XFS_IFEXTENTS) { - error = xfs_bmap_last_offset(NULL, ip, &last_block, - XFS_DATA_FORK); - if (error) { - last_block = 0; - } + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + if (is_dir) { + rdev = 0; + resblks = XFS_MKDIR_SPACE_RES(mp, name->len); + tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres; + tres.tr_logcount = XFS_MKDIR_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); } else { - last_block = 0; + resblks = XFS_CREATE_SPACE_RES(mp, name->len); + tres.tr_logres = M_RES(mp)->tr_create.tr_logres; + tres.tr_logcount = XFS_CREATE_LOG_COUNT; + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } - size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size); - last_block = XFS_FILEOFF_MAX(last_block, size_last_block); - last_byte = XFS_FSB_TO_B(mp, last_block); - if (last_byte < 0) { - return XFS_MAXIOFFSET(mp); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + + /* + * Initially assume that the file does not exist and + * reserve the resources for that case. If that is not + * the case we'll drop the one we have and get a more + * appropriate transaction later. + */ + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(tp, &tres, resblks, 0); + if (error == ENOSPC) { + /* flush outstanding delalloc blocks and retry */ + xfs_flush_inodes(mp); + error = xfs_trans_reserve(tp, &tres, resblks, 0); } - last_byte += (1 << mp->m_writeio_log); - if (last_byte < 0) { - return XFS_MAXIOFFSET(mp); + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, &tres, 0, 0); + } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; } - return last_byte; -} -#if defined(XFS_RW_TRACE) -STATIC void -xfs_itrunc_trace( - int tag, - xfs_inode_t *ip, - int flag, - xfs_fsize_t new_size, - xfs_off_t toss_start, - xfs_off_t toss_finish) -{ - if (ip->i_rwtrace == NULL) { - return; + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + xfs_bmap_init(&free_list, &first_block); + + /* + * Reserve disk quota and the inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_canenter(tp, dp, name, resblks); + if (error) + goto out_trans_cancel; + + /* + * A newly created regular or special file just has one directory + * entry pointing to them, but a directory also the "." entry + * pointing to itself. + */ + error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, resblks > 0, &ip, &committed); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; } - ktrace_enter(ip->i_rwtrace, - (void*)((long)tag), - (void*)ip, - (void*)(unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff), - (void*)(unsigned long)(ip->i_d.di_size & 0xffffffff), - (void*)((long)flag), - (void*)(unsigned long)((new_size >> 32) & 0xffffffff), - (void*)(unsigned long)(new_size & 0xffffffff), - (void*)(unsigned long)((toss_start >> 32) & 0xffffffff), - (void*)(unsigned long)(toss_start & 0xffffffff), - (void*)(unsigned long)((toss_finish >> 32) & 0xffffffff), - (void*)(unsigned long)(toss_finish & 0xffffffff), - (void*)(unsigned long)current_cpu(), - (void*)(unsigned long)current_pid(), - (void*)NULL, - (void*)NULL, - (void*)NULL); -} -#else -#define xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish) -#endif + /* + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; -/* - * Start the truncation of the file to new_size. The new size - * must be smaller than the current size. This routine will - * clear the buffer and page caches of file data in the removed - * range, and xfs_itruncate_finish() will remove the underlying - * disk blocks. - * - * The inode must have its I/O lock locked EXCLUSIVELY, and it - * must NOT have the inode lock held at all. This is because we're - * calling into the buffer/page cache code and we can't hold the - * inode lock when we do so. - * - * We need to wait for any direct I/Os in flight to complete before we - * proceed with the truncate. This is needed to prevent the extents - * being read or written by the direct I/Os from being removed while the - * I/O is in flight as there is no other method of synchronising - * direct I/O with the truncate operation. Also, because we hold - * the IOLOCK in exclusive mode, we prevent new direct I/Os from being - * started until the truncate completes and drops the lock. Essentially, - * the vn_iowait() call forms an I/O barrier that provides strict ordering - * between direct I/Os and the truncate operation. - * - * The flags parameter can have either the value XFS_ITRUNC_DEFINITE - * or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used - * in the case that the caller is locking things out of order and - * may not be able to call xfs_itruncate_finish() with the inode lock - * held without dropping the I/O lock. If the caller must drop the - * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start() - * must be called again with all the same restrictions as the initial - * call. - */ -void -xfs_itruncate_start( - xfs_inode_t *ip, - uint flags, - xfs_fsize_t new_size) -{ - xfs_fsize_t last_byte; - xfs_off_t toss_start; - xfs_mount_t *mp; - bhv_vnode_t *vp; + error = xfs_dir_createname(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks ? + resblks - XFS_IALLOC_SPACE_RES(mp) : 0); + if (error) { + ASSERT(error != ENOSPC); + goto out_trans_abort; + } + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + if (is_dir) { + error = xfs_dir_init(tp, ip, dp); + if (error) + goto out_bmap_cancel; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); - ASSERT((flags == XFS_ITRUNC_DEFINITE) || - (flags == XFS_ITRUNC_MAYBE)); + error = xfs_bumplink(tp, dp); + if (error) + goto out_bmap_cancel; + } - mp = ip->i_mount; - vp = XFS_ITOV(ip); + /* + * If this is a synchronous mount, make sure that the + * create transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ - /* - * Call toss_pages or flushinval_pages to get rid of pages - * overlapping the region being removed. We have to use - * the less efficient flushinval_pages in the case that the - * caller may not be able to finish the truncate without - * dropping the inode's I/O lock. Make sure - * to catch any pages brought in by buffers overlapping - * the EOF by searching out beyond the isize by our - * block size. We round new_size up to a block boundary - * so that we don't toss things on the same block as - * new_size but before it. - * - * Before calling toss_page or flushinval_pages, make sure to - * call remapf() over the same region if the file is mapped. - * This frees up mapped file references to the pages in the - * given range and for the flushinval_pages case it ensures - * that we get the latest mapped changes flushed out. + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. */ - toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - toss_start = XFS_FSB_TO_B(mp, toss_start); - if (toss_start < 0) { - /* - * The place to start tossing is beyond our maximum - * file size, so there is no way that the data extended - * out there. - */ - return; + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + return error; +} + +int +xfs_create_tmpfile( + struct xfs_inode *dp, + struct dentry *dentry, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_inode *ip = NULL; + struct xfs_trans *tp = NULL; + int error; + uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_trans_res *tres; + uint resblks; + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + return error; + + resblks = XFS_IALLOC_SPACE_RES(mp); + tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE); + + tres = &M_RES(mp)->tr_create_tmpfile; + error = xfs_trans_reserve(tp, tres, resblks, 0); + if (error == ENOSPC) { + /* No space at all so try a "no-allocation" reservation */ + resblks = 0; + error = xfs_trans_reserve(tp, tres, 0, 0); } - last_byte = xfs_file_last_byte(ip); - xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start, - last_byte); - if (last_byte > toss_start) { - if (flags & XFS_ITRUNC_DEFINITE) { - bhv_vop_toss_pages(vp, toss_start, -1, FI_REMAPF_LOCKED); - } else { - bhv_vop_flushinval_pages(vp, toss_start, -1, FI_REMAPF_LOCKED); - } + if (error) { + cancel_flags = 0; + goto out_trans_cancel; } -#ifdef DEBUG - if (new_size == 0) { - ASSERT(VN_CACHED(vp) == 0); + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto out_trans_cancel; + + error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto out_trans_cancel; + goto out_trans_abort; } -#endif + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + /* + * Attach the dquot(s) to the inodes and modify them incore. + * These ids of the inode couldn't have changed since the new + * inode has been locked ever since it was created. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + ip->i_d.di_nlink--; + error = xfs_iunlink(tp, ip); + if (error) + goto out_trans_abort; + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_release_inode; + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + out_trans_abort: + cancel_flags |= XFS_TRANS_ABORT; + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + out_release_inode: + /* + * Wait until after the current transaction is aborted to + * release the inode. This prevents recursive transactions + * and deadlocks from xfs_inactive. + */ + if (ip) + IRELE(ip); + + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + return error; } -/* - * Shrink the file to the given new_size. The new - * size must be smaller than the current size. - * This will free up the underlying blocks - * in the removed range after a call to xfs_itruncate_start() - * or xfs_atruncate_start(). - * - * The transaction passed to this routine must have made - * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. - * This routine may commit the given transaction and - * start new ones, so make sure everything involved in - * the transaction is tidy before calling here. - * Some transaction will be returned to the caller to be - * committed. The incoming transaction must already include - * the inode, and both inode locks must be held exclusively. - * The inode must also be "held" within the transaction. On - * return the inode will be "held" within the returned transaction. - * This routine does NOT require any disk space to be reserved - * for it within the transaction. - * - * The fork parameter must be either xfs_attr_fork or xfs_data_fork, - * and it indicates the fork which is to be truncated. For the - * attribute fork we only support truncation to size 0. - * - * We use the sync parameter to indicate whether or not the first - * transaction we perform might have to be synchronous. For the attr fork, - * it needs to be so if the unlink of the inode is not yet known to be - * permanent in the log. This keeps us from freeing and reusing the - * blocks of the attribute fork before the unlink of the inode becomes - * permanent. - * - * For the data fork, we normally have to run synchronously if we're - * being called out of the inactive path or we're being called - * out of the create path where we're truncating an existing file. - * Either way, the truncate needs to be sync so blocks don't reappear - * in the file with altered data in case of a crash. wsync filesystems - * can run the first case async because anything that shrinks the inode - * has to run sync so by the time we're called here from inactive, the - * inode size is permanently set to 0. - * - * Calls from the truncate path always need to be sync unless we're - * in a wsync filesystem and the file has already been unlinked. - * - * The caller is responsible for correctly setting the sync parameter. - * It gets too hard for us to guess here which path we're being called - * out of just based on inode state. - */ int -xfs_itruncate_finish( - xfs_trans_t **tp, - xfs_inode_t *ip, - xfs_fsize_t new_size, - int fork, - int sync) +xfs_link( + xfs_inode_t *tdp, + xfs_inode_t *sip, + struct xfs_name *target_name) { - xfs_fsblock_t first_block; - xfs_fileoff_t first_unmap_block; - xfs_fileoff_t last_block; - xfs_filblks_t unmap_len=0; - xfs_mount_t *mp; - xfs_trans_t *ntp; - int done; - int committed; - xfs_bmap_free_t free_list; - int error; + xfs_mount_t *mp = tdp->i_mount; + xfs_trans_t *tp; + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int resblks; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); - ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size)); - ASSERT(*tp != NULL); - ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - ASSERT(ip->i_transp == *tp); - ASSERT(ip->i_itemp != NULL); - ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD); + trace_xfs_link(tdp, target_name); + + ASSERT(!S_ISDIR(sip->i_d.di_mode)); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); - ntp = *tp; - mp = (ntp)->t_mountp; - ASSERT(! XFS_NOT_DQATTACHED(mp, ip)); + error = xfs_qm_dqattach(sip, 0); + if (error) + goto std_return; + + error = xfs_qm_dqattach(tdp, 0); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + resblks = XFS_LINK_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); + + xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* - * We only support truncating the entire attribute fork. + * If we are using project inheritance, we only allow hard link + * creation in our tree when the project IDs are the same; else + * the tree quota mechanism could be circumvented. */ - if (fork == XFS_ATTR_FORK) { - new_size = 0LL; + if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; } - first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); - xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0); + + error = xfs_dir_canenter(tp, tdp, target_name, resblks); + if (error) + goto error_return; + + xfs_bmap_init(&free_list, &first_block); + + if (sip->i_d.di_nlink == 0) { + error = xfs_iunlink_remove(tp, sip); + if (error) + goto abort_return; + } + + error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto abort_return; + xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); + + error = xfs_bumplink(tp, sip); + if (error) + goto abort_return; + /* - * The first thing we do is set the size to new_size permanently - * on disk. This way we don't have to worry about anyone ever - * being able to look at the data being freed even in the face - * of a crash. What we're getting around here is the case where - * we free a block, it is allocated to another file, it is written - * to, and then we crash. If the new data gets written to the - * file but the log buffers containing the free and reallocation - * don't, then we'd end up with garbage in the blocks being freed. - * As long as we make the new_size permanent before actually - * freeing any blocks it doesn't matter if they get writtten to. - * - * The callers must signal into us whether or not the size - * setting here must be synchronous. There are a few cases - * where it doesn't have to be synchronous. Those cases - * occur if the file is unlinked and we know the unlink is - * permanent or if the blocks being truncated are guaranteed - * to be beyond the inode eof (regardless of the link count) - * and the eof value is permanent. Both of these cases occur - * only on wsync-mounted filesystems. In those cases, we're - * guaranteed that no user will ever see the data in the blocks - * that are being truncated so the truncate can run async. - * In the free beyond eof case, the file may wind up with - * more blocks allocated to it than it needs if we crash - * and that won't get fixed until the next time the file - * is re-opened and closed but that's ok as that shouldn't - * be too many blocks. - * - * However, we can't just make all wsync xactions run async - * because there's one call out of the create path that needs - * to run sync where it's truncating an existing file to size - * 0 whose size is > 0. - * - * It's probably possible to come up with a test in this - * routine that would correctly distinguish all the above - * cases from the values of the function parameters and the - * inode state but for sanity's sake, I've decided to let the - * layers above just tell us. It's simpler to correctly figure - * out in the layer above exactly under what conditions we - * can run async and I think it's easier for others read and - * follow the logic in case something has to be changed. - * cscope is your friend -- rcc. - * - * The attribute fork is much simpler. - * - * For the attribute fork we allow the caller to tell us whether - * the unlink of the inode that led to this call is yet permanent - * in the on disk log. If it is not and we will be freeing extents - * in this inode then we make the first transaction synchronous - * to make sure that the unlink is permanent by the time we free - * the blocks. + * If this is a synchronous mount, make sure that the + * link transaction goes to disk before returning to + * the user. */ - if (fork == XFS_DATA_FORK) { - if (ip->i_d.di_nextents > 0) { - ip->i_d.di_size = new_size; - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); - } - } else if (sync) { - ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC)); - if (ip->i_d.di_anextents > 0) - xfs_trans_set_sync(ntp); + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish (&tp, &free_list, &committed); + if (error) { + xfs_bmap_cancel(&free_list); + goto abort_return; } - ASSERT(fork == XFS_DATA_FORK || - (fork == XFS_ATTR_FORK && - ((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) || - (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC))))); + + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} + +/* + * Free up the underlying blocks past new_size. The new size must be smaller + * than the current size. This routine can be used both for the attribute and + * data fork, and does not modify the inode size, which is left to the caller. + * + * The transaction passed to this routine must have made a permanent log + * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the + * given transaction and start new ones, so make sure everything involved in + * the transaction is tidy before calling here. Some transaction will be + * returned to the caller to be committed. The incoming transaction must + * already include the inode, and both inode locks must be held exclusively. + * The inode must also be "held" within the transaction. On return the inode + * will be "held" within the returned transaction. This routine does NOT + * require any disk space to be reserved for it within the transaction. + * + * If we get an error, we must return with the inode locked and linked into the + * current transaction. This keeps things simple for the higher level code, + * because it always knows that the inode is locked and held in the transaction + * that returns to it whether errors occur or not. We don't mark the inode + * dirty on error so that transactions can be easily aborted if possible. + */ +int +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp = *tpp; + struct xfs_trans *ntp; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + xfs_fileoff_t first_unmap_block; + xfs_fileoff_t last_block; + xfs_filblks_t unmap_len; + int committed; + int error = 0; + int done = 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!atomic_read(&VFS_I(ip)->i_count) || + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(new_size <= XFS_ISIZE(ip)); + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(ip->i_itemp != NULL); + ASSERT(ip->i_itemp->ili_lock_flags == 0); + ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); + + trace_xfs_itruncate_extents_start(ip, new_size); /* * Since it is possible for space to become allocated beyond @@ -1651,200 +1529,395 @@ xfs_itruncate_finish( * beyond the maximum file size (ie it is the same as last_block), * then there is nothing to do. */ - last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - ASSERT(first_unmap_block <= last_block); - done = 0; - if (last_block == first_unmap_block) { - done = 1; - } else { - unmap_len = last_block - first_unmap_block + 1; - } + first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size); + last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + if (first_unmap_block == last_block) + return 0; + + ASSERT(first_unmap_block < last_block); + unmap_len = last_block - first_unmap_block + 1; while (!done) { - /* - * Free up up to XFS_ITRUNC_MAX_EXTENTS. xfs_bunmapi() - * will tell us whether it freed the entire range or - * not. If this is a synchronous mount (wsync), - * then we can tell bunmapi to keep all the - * transactions asynchronous since the unlink - * transaction that made this inode inactive has - * already hit the disk. There's no danger of - * the freed blocks being reused, there being a - * crash, and the reused blocks suddenly reappearing - * in this file with garbage in them once recovery - * runs. - */ - XFS_BMAP_INIT(&free_list, &first_block); - error = XFS_BUNMAPI(mp, ntp, &ip->i_iocore, + xfs_bmap_init(&free_list, &first_block); + error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, - XFS_BMAPI_AFLAG(fork) | - (sync ? 0 : XFS_BMAPI_ASYNC), + xfs_bmapi_aflag(whichfork), XFS_ITRUNC_MAX_EXTENTS, &first_block, &free_list, - NULL, &done); - if (error) { - /* - * If the bunmapi call encounters an error, - * return to the caller where the transaction - * can be properly aborted. We just need to - * make sure we're not holding any resources - * that we were not when we came in. - */ - xfs_bmap_cancel(&free_list); - return error; - } + &done); + if (error) + goto out_bmap_cancel; /* * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(tp, &free_list, first_block, - &committed); - ntp = *tp; - if (error) { - /* - * If the bmap finish call encounters an error, - * return to the caller where the transaction - * can be properly aborted. We just need to - * make sure we're not holding any resources - * that we were not when we came in. - * - * Aborting from this point might lose some - * blocks in the file system, but oh well. - */ - xfs_bmap_cancel(&free_list); - if (committed) { - /* - * If the passed in transaction committed - * in xfs_bmap_finish(), then we want to - * add the inode to this one before returning. - * This keeps things simple for the higher - * level code, because it always knows that - * the inode is locked and held in the - * transaction that returns to it whether - * errors occur or not. We don't mark the - * inode dirty so that this transaction can - * be easily aborted if possible. - */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); - } - return error; - } + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (committed) + xfs_trans_ijoin(tp, ip, 0); + if (error) + goto out_bmap_cancel; if (committed) { /* - * The first xact was committed, - * so add the inode to the new one. - * Mark it dirty so it will be logged - * and moved forward in the log as - * part of every commit. + * Mark the inode dirty so it will be logged and + * moved forward in the log as part of every commit. */ - xfs_trans_ijoin(ntp, ip, - XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - ntp = xfs_trans_dup(ntp); - (void) xfs_trans_commit(*tp, 0, NULL); - *tp = ntp; - error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); + + ntp = xfs_trans_dup(tp); + error = xfs_trans_commit(tp, 0); + tp = ntp; + + xfs_trans_ijoin(tp, ip, 0); + + if (error) + goto out; + /* - * Add the inode being truncated to the next chained - * transaction. + * Transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() */ - xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(ntp, ip); + xfs_log_ticket_put(tp->t_ticket); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) - return (error); + goto out; } + + /* + * Always re-log the inode so that our permanent transaction can keep + * on rolling it forward in the log. + */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + trace_xfs_itruncate_extents_end(ip, new_size); + +out: + *tpp = tp; + return error; +out_bmap_cancel: /* - * Only update the size in the case of the data fork, but - * always re-log the inode so that our permanent transaction - * can keep on rolling it forward in the log. + * If the bunmapi call encounters an error, return to the caller where + * the transaction can be properly aborted. We just need to make sure + * we're not holding any resources that we were not when we came in. */ - if (fork == XFS_DATA_FORK) { - xfs_isize_check(mp, ip, new_size); - ip->i_d.di_size = new_size; + xfs_bmap_cancel(&free_list); + goto out; +} + +int +xfs_release( + xfs_inode_t *ip) +{ + xfs_mount_t *mp = ip->i_mount; + int error; + + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) + return 0; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return 0; + + if (!XFS_FORCED_SHUTDOWN(mp)) { + int truncated; + + /* + * If we previously truncated this file and removed old data + * in the process, we want to initiate "early" writeout on + * the last close. This is an attempt to combat the notorious + * NULL files problem which is particularly noticeable from a + * truncate down, buffered (re-)write (delalloc), followed by + * a crash. What we are effectively doing here is + * significantly reducing the time window where we'd otherwise + * be exposed to that problem. + */ + truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); + if (truncated) { + xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); + if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { + error = -filemap_flush(VFS_I(ip)->i_mapping); + if (error) + return error; + } + } + } + + if (ip->i_d.di_nlink == 0) + return 0; + + if (xfs_can_free_eofblocks(ip, false)) { + + /* + * If we can't get the iolock just skip truncating the blocks + * past EOF because we could deadlock with the mmap_sem + * otherwise. We'll get another chance to drop them once the + * last reference to the inode is dropped, so we'll never leak + * blocks permanently. + * + * Further, check if the inode is being opened, written and + * closed frequently and we have delayed allocation blocks + * outstanding (e.g. streaming writes from the NFS server), + * truncating the blocks past EOF will cause fragmentation to + * occur. + * + * In this case don't do the truncation, either, but we have to + * be careful how we detect this case. Blocks beyond EOF show + * up as i_delayed_blks even when the inode is clean, so we + * need to truncate them away first before checking for a dirty + * release. Hence on the first dirty close we will still remove + * the speculative allocation, but after that we will leave it + * in place. + */ + if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE)) + return 0; + + error = xfs_free_eofblocks(mp, ip, true); + if (error && error != EAGAIN) + return error; + + /* delalloc blocks after truncation means it really is dirty */ + if (ip->i_delayed_blks) + xfs_iflags_set(ip, XFS_IDIRTY_RELEASE); } - xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); - ASSERT((new_size != 0) || - (fork == XFS_ATTR_FORK) || - (ip->i_delayed_blks == 0)); - ASSERT((new_size != 0) || - (fork == XFS_ATTR_FORK) || - (ip->i_d.di_nextents == 0)); - xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0); return 0; } - /* - * xfs_igrow_start + * xfs_inactive_truncate * - * Do the first part of growing a file: zero any data in the last - * block that is beyond the old EOF. We need to do this before - * the inode is joined to the transaction to modify the i_size. - * That way we can drop the inode lock and call into the buffer - * cache to get the buffer mapping the EOF. + * Called to perform a truncate when an inode becomes unlinked. */ -int -xfs_igrow_start( - xfs_inode_t *ip, - xfs_fsize_t new_size, - cred_t *credp) +STATIC int +xfs_inactive_truncate( + struct xfs_inode *ip) { - int error; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + xfs_trans_cancel(tp, 0); + return error; + } - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); - ASSERT(new_size > ip->i_d.di_size); + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); /* - * Zero any pages that may have been created by - * xfs_write_file() beyond the end of the file - * and any blocks between the old and new file sizes. + * Log the inode size first to prevent stale data exposure in the event + * of a system crash before the truncate completes. See the related + * comment in xfs_setattr_size() for details. */ - error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, - ip->i_d.di_size, new_size); + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) + goto error_trans_cancel; + + ASSERT(ip->i_d.di_nextents == 0); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error_unlock; + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } /* - * xfs_igrow_finish + * xfs_inactive_ifree() + * + * Perform the inode free when an inode is unlinked. + */ +STATIC int +xfs_inactive_ifree( + struct xfs_inode *ip) +{ + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + + /* + * The ifree transaction might need to allocate blocks for record + * insertion to the finobt. We don't want to fail here at ENOSPC, so + * allow ifree to dip into the reserved block pool if necessary. + * + * Freeing large sets of inodes generally means freeing inode chunks, + * directory and file data blocks, so this should be relatively safe. + * Only under severe circumstances should it be possible to free enough + * inodes to exhaust the reserve block pool via finobt expansion while + * at the same time not creating free space in the filesystem. + * + * Send a warning if the reservation does happen to fail, as the inode + * now remains allocated and sits on the unlinked list until the fs is + * repaired. + */ + tp->t_flags |= XFS_TRANS_RESERVE; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0); + if (error) { + if (error == ENOSPC) { + xfs_warn_ratelimited(mp, + "Failed to remove inode(s) from unlinked list. " + "Please free space, unmount and run xfs_repair."); + } else { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + xfs_bmap_init(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); + if (error) { + /* + * If we fail to free the inode, shut down. The cancel + * might do that, we need to make sure. Otherwise the + * inode might be lost for a long time or forever. + */ + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_notice(mp, "%s: xfs_ifree returned error %d", + __func__, error); + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + } + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + /* + * Credit the quota account(s). The inode is gone. + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + + /* + * Just ignore errors at this point. There is nothing we can + * do except to try to keep going. Make sure it's not a silent + * error. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", + __func__, error); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + xfs_notice(mp, "%s: xfs_trans_commit returned error %d", + __func__, error); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + +/* + * xfs_inactive * - * This routine is called to extend the size of a file. - * The inode must have both the iolock and the ilock locked - * for update and it must be a part of the current transaction. - * The xfs_igrow_start() function must have been called previously. - * If the change_flag is not zero, the inode change timestamp will - * be updated. + * This is called when the vnode reference count for the vnode + * goes to zero. If the file has been unlinked, then it must + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. */ void -xfs_igrow_finish( - xfs_trans_t *tp, - xfs_inode_t *ip, - xfs_fsize_t new_size, - int change_flag) +xfs_inactive( + xfs_inode_t *ip) { - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); - ASSERT(ip->i_transp == tp); - ASSERT(new_size > ip->i_d.di_size); + struct xfs_mount *mp; + int error; + int truncate = 0; /* - * Update the file size. Update the inode change timestamp - * if change_flag set. + * If the inode is already free, then there can be nothing + * to clean up here. */ - ip->i_d.di_size = new_size; - if (change_flag) - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (ip->i_d.di_mode == 0) { + ASSERT(ip->i_df.if_real_bytes == 0); + ASSERT(ip->i_df.if_broot_bytes == 0); + return; + } -} + mp = ip->i_mount; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (mp->m_flags & XFS_MOUNT_RDONLY) + return; + + if (ip->i_d.di_nlink != 0) { + /* + * force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with + * broken free space accounting. + */ + if (xfs_can_free_eofblocks(ip, true)) + xfs_free_eofblocks(mp, ip, false); + + return; + } + if (S_ISREG(ip->i_d.di_mode) && + (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 || + ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + + error = xfs_qm_dqattach(ip, 0); + if (error) + return; + + if (S_ISLNK(ip->i_d.di_mode)) + error = xfs_inactive_symlink(ip); + else if (truncate) + error = xfs_inactive_truncate(ip); + if (error) + return; + + /* + * If there are attributes associated with the file then blow them away + * now. The code calls a routine that recursively deconstructs the + * attribute fork. We need to just commit the current transaction + * because we can't use it for xfs_attr_inactive(). + */ + if (ip->i_d.di_anextents > 0) { + ASSERT(ip->i_d.di_forkoff != 0); + + error = xfs_attr_inactive(ip); + if (error) + return; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + + ASSERT(ip->i_d.di_anextents == 0); + + /* + * Free the inode. + */ + error = xfs_inactive_ifree(ip); + if (error) + return; + + /* + * Release the dquots held by inode, if any. + */ + xfs_qm_dqdetach(ip); +} /* * This is called when the inode's link count goes to 0. @@ -1861,45 +1934,25 @@ xfs_iunlink( xfs_dinode_t *dip; xfs_buf_t *agibp; xfs_buf_t *ibp; - xfs_agnumber_t agno; - xfs_daddr_t agdaddr; xfs_agino_t agino; short bucket_index; int offset; int error; - int agi_ok; ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_mode != 0); - ASSERT(ip->i_transp == tp); mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); - /* * Get the agi buffer first. It ensures lock ordering * on the list. */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, - XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { + error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp); + if (error) return error; - } - /* - * Validate the magic number of the agi block. - */ agi = XFS_BUF_TO_AGI(agibp); - agi_ok = - be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK, - XFS_RANDOM_IUNLINK))) { - XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi); - xfs_trans_brelse(tp, agibp); - return XFS_ERROR(EFSCORRUPTED); - } + /* * Get the index into the agi hash table for the * list this inode will go on. @@ -1910,23 +1963,26 @@ xfs_iunlink( ASSERT(agi->agi_unlinked[bucket_index]); ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino); - if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) { + if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) { /* * There is already another inode in the bucket we need * to add ourselves to. Add us at the front of the list. * Here we put the head pointer into our next pointer, * and then we fall through to point the head at us. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); - if (error) { + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); + if (error) return error; - } - ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO); - ASSERT(dip->di_next_unlinked); - /* both on-disk, don't endian flip twice */ + + ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO)); dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; - offset = ip->i_boffset + + offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1960,7 +2016,6 @@ xfs_iunlink_remove( xfs_buf_t *agibp; xfs_buf_t *ibp; xfs_agnumber_t agno; - xfs_daddr_t agdaddr; xfs_agino_t agino; xfs_agino_t next_agino; xfs_buf_t *last_ibp; @@ -1968,45 +2023,20 @@ xfs_iunlink_remove( short bucket_index; int offset, last_offset = 0; int error; - int agi_ok; - /* - * First pull the on-disk inode from the AGI unlinked list. - */ mp = tp->t_mountp; - agno = XFS_INO_TO_AGNO(mp, ip->i_ino); - agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); /* * Get the agi buffer first. It ensures lock ordering * on the list. */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr, - XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { - cmn_err(CE_WARN, - "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) return error; - } - /* - * Validate the magic number of the agi block. - */ + agi = XFS_BUF_TO_AGI(agibp); - agi_ok = - be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC && - XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); - if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE, - XFS_RANDOM_IUNLINK_REMOVE))) { - XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW, - mp, agi); - xfs_trans_brelse(tp, agibp); - cmn_err(CE_WARN, - "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.", - mp->m_fsname); - return XFS_ERROR(EFSCORRUPTED); - } + /* * Get the index into the agi hash table for the * list this inode will go on. @@ -2014,31 +2044,34 @@ xfs_iunlink_remove( agino = XFS_INO_TO_AGINO(mp, ip->i_ino); ASSERT(agino != 0); bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; - ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO); + ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)); ASSERT(agi->agi_unlinked[bucket_index]); if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) { /* - * We're at the head of the list. Get the inode's - * on-disk buffer to see if there is anyone after us - * on the list. Only modify our next pointer if it - * is not already NULLAGINO. This saves us the overhead - * of dealing with the buffer when there is no need to - * change it. + * We're at the head of the list. Get the inode's on-disk + * buffer to see if there is anyone after us on the list. + * Only modify our next pointer if it is not already NULLAGINO. + * This saves us the overhead of dealing with the buffer when + * there is no need to change it. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - cmn_err(CE_WARN, - "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.", + __func__, error); return error; } - next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); + next_agino = be32_to_cpu(dip->di_next_unlinked); ASSERT(next_agino != 0); if (next_agino != NULLAGINO) { - INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); - offset = ip->i_boffset + + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -2063,45 +2096,59 @@ xfs_iunlink_remove( next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); last_ibp = NULL; while (next_agino != agino) { - /* - * If the last inode wasn't the one pointing to - * us, then release its buffer since we're not - * going to do anything with it. - */ - if (last_ibp != NULL) { + struct xfs_imap imap; + + if (last_ibp) xfs_trans_brelse(tp, last_ibp); - } + + imap.im_blkno = 0; next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino); - error = xfs_inotobp(mp, tp, next_ino, &last_dip, - &last_ibp, &last_offset); + + error = xfs_imap(mp, tp, next_ino, &imap, 0); + if (error) { + xfs_warn(mp, + "%s: xfs_imap returned error %d.", + __func__, error); + return error; + } + + error = xfs_imap_to_bp(mp, tp, &imap, &last_dip, + &last_ibp, 0, 0); if (error) { - cmn_err(CE_WARN, - "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + xfs_warn(mp, + "%s: xfs_imap_to_bp returned error %d.", + __func__, error); return error; } - next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT); + + last_offset = imap.im_boffset; + next_agino = be32_to_cpu(last_dip->di_next_unlinked); ASSERT(next_agino != NULLAGINO); ASSERT(next_agino != 0); } + /* - * Now last_ibp points to the buffer previous to us on - * the unlinked list. Pull us from the list. + * Now last_ibp points to the buffer previous to us on the + * unlinked list. Pull us from the list. */ - error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, + 0, 0); if (error) { - cmn_err(CE_WARN, - "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", - error, mp->m_fsname); + xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.", + __func__, error); return error; } - next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT); + next_agino = be32_to_cpu(dip->di_next_unlinked); ASSERT(next_agino != 0); ASSERT(next_agino != agino); if (next_agino != NULLAGINO) { - INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO); - offset = ip->i_boffset + + dip->di_next_unlinked = cpu_to_be32(NULLAGINO); + offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -2112,9 +2159,13 @@ xfs_iunlink_remove( /* * Point the previous inode on the list to the next inode. */ - INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino); + last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -2123,14 +2174,12 @@ xfs_iunlink_remove( return 0; } -static __inline__ int xfs_inode_clean(xfs_inode_t *ip) -{ - return (((ip->i_itemp == NULL) || - !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) && - (ip->i_update_core == 0)); -} - -STATIC void +/* + * A big issue when freeing the inode cluster is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ +STATIC int xfs_ifree_cluster( xfs_inode_t *free_ip, xfs_trans_t *tp, @@ -2138,163 +2187,160 @@ xfs_ifree_cluster( { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; + int inodes_per_cluster; int nbufs; - int ninodes; - int i, j, found, pre_flushed; + int i, j; xfs_daddr_t blkno; xfs_buf_t *bp; - xfs_ihash_t *ih; - xfs_inode_t *ip, **ip_found; + xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; - SPLDECL(s); - - if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { - blks_per_cluster = 1; - ninodes = mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp); - } else { - blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / - mp->m_sb.sb_blocksize; - ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; - nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; - } + struct xfs_perag *pag; - ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS); + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + nbufs = mp->m_ialloc_blks / blks_per_cluster; - for (j = 0; j < nbufs; j++, inum += ninodes) { + for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); + /* + * We obtain and lock the backing buffer first in the process + * here, as we have to ensure that any dirty inode that we + * can't get the flush lock on is attached to the buffer. + * If we scan the in-memory inodes first, then buffer IO can + * complete before we get a lock on it, and hence we may fail + * to mark all the active inodes on the buffer stale. + */ + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XBF_UNMAPPED); + + if (!bp) + return ENOMEM; /* - * Look for each inode in memory and attempt to lock it, - * we can be racing with flush and tail pushing here. - * any inode we get the locks on, add to an array of - * inode items to process later. - * - * The get the buffer lock, we could beat a flush - * or tail pushing thread to the lock here, in which - * case they will go looking for the inode buffer - * and fail, we need some other form of interlock - * here. + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to + * attach stale cached inodes on it. That means it will never be + * dispatched for IO. If it is, we want to know about it, and we + * want it to fail. We can acheive this by adding a write + * verifier to the buffer. */ - found = 0; - for (i = 0; i < ninodes; i++) { - ih = XFS_IHASH(mp, inum + i); - read_lock(&ih->ih_lock); - for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { - if (ip->i_ino == inum + i) - break; - } + bp->b_ops = &xfs_inode_buf_ops; - /* Inode not in memory or we found it already, - * nothing to do - */ - if (!ip || (ip->i_flags & XFS_ISTALE)) { - read_unlock(&ih->ih_lock); - continue; + /* + * Walk the inodes already attached to the buffer and mark them + * stale. These will all have the flush locks held, so an + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. + */ + lip = bp->b_fspriv; + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, + &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + xfs_iflags_set(iip->ili_inode, XFS_ISTALE); } + lip = lip->li_bio_list; + } + - if (xfs_inode_clean(ip)) { - read_unlock(&ih->ih_lock); + /* + * For each inode in memory attempt to add it to the inode + * buffer and set it up for being staled on buffer IO + * completion. This is safe as we've locked out tail pushing + * and flushing by locking the buffer. + * + * We have already marked every inode that was part of a + * transaction stale above, which means there is no point in + * even trying to lock them. + */ + for (i = 0; i < inodes_per_cluster; i++) { +retry: + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, + XFS_INO_TO_AGINO(mp, (inum + i))); + + /* Inode not in memory, nothing to do */ + if (!ip) { + rcu_read_unlock(); continue; } - /* If we can get the locks then add it to the - * list, otherwise by the time we get the bp lock - * below it will already be attached to the - * inode buffer. - */ - - /* This inode will already be locked - by us, lets - * keep it that way. + /* + * because this is an RCU protected lookup, we could + * find a recently freed or even reallocated inode + * during the lookup. We need to check under the + * i_flags_lock for a valid inode here. Skip it if it + * is not valid, the wrong inode or stale. */ - - if (ip == free_ip) { - if (xfs_iflock_nowait(ip)) { - ip->i_flags |= XFS_ISTALE; - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - } else { - ip_found[found++] = ip; - } - } - read_unlock(&ih->ih_lock); + spin_lock(&ip->i_flags_lock); + if (ip->i_ino != inum + i || + __xfs_iflags_test(ip, XFS_ISTALE)) { + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); continue; } + spin_unlock(&ip->i_flags_lock); - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - if (xfs_iflock_nowait(ip)) { - ip->i_flags |= XFS_ISTALE; - - if (xfs_inode_clean(ip)) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else { - ip_found[found++] = ip; - } - } else { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ + if (ip != free_ip && + !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; } + rcu_read_unlock(); - read_unlock(&ih->ih_lock); - } - - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * blks_per_cluster, - XFS_BUF_LOCK); + xfs_iflock(ip); + xfs_iflags_set(ip, XFS_ISTALE); - pre_flushed = 0; - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - while (lip) { - if (lip->li_type == XFS_LI_INODE) { - iip = (xfs_inode_log_item_t *)lip; - ASSERT(iip->ili_logged == 1); - lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; - AIL_LOCK(mp,s); - iip->ili_flush_lsn = iip->ili_item.li_lsn; - AIL_UNLOCK(mp, s); - iip->ili_inode->i_flags |= XFS_ISTALE; - pre_flushed++; - } - lip = lip->li_bio_list; - } - - for (i = 0; i < found; i++) { - ip = ip_found[i]; + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ iip = ip->i_itemp; - - if (!iip) { - ip->i_update_core = 0; + if (!iip || xfs_inode_clean(ip)) { + ASSERT(ip != free_ip); xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - iip->ili_last_fields = iip->ili_format.ilf_fields; - iip->ili_format.ilf_fields = 0; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; iip->ili_logged = 1; - AIL_LOCK(mp,s); - iip->ili_flush_lsn = iip->ili_item.li_lsn; - AIL_UNLOCK(mp, s); - - xfs_buf_attach_iodone(bp, - (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_istale_done, (xfs_log_item_t *)iip); - if (ip != free_ip) { + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); + + xfs_buf_attach_iodone(bp, xfs_istale_done, + &iip->ili_item); + + if (ip != free_ip) xfs_iunlock(ip, XFS_ILOCK_EXCL); - } } - if (found || pre_flushed) - xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } - kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); + xfs_perag_put(pag); + return 0; } /* @@ -2317,33 +2363,28 @@ xfs_ifree( int delete; xfs_ino_t first_ino; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ip->i_transp == tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); - ASSERT((ip->i_d.di_size == 0) || - ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); + ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode)); ASSERT(ip->i_d.di_nblocks == 0); /* * Pull the on-disk inode from the AGI unlinked list. */ error = xfs_iunlink_remove(tp, ip); - if (error != 0) { + if (error) return error; - } error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); - if (error != 0) { + if (error) return error; - } + ip->i_d.di_mode = 0; /* mark incore inode as free */ ip->i_d.di_flags = 0; ip->i_d.di_dmevmask = 0; ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ - ip->i_df.if_ext_max = - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; /* @@ -2353,1021 +2394,873 @@ xfs_ifree( ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (delete) { - xfs_ifree_cluster(ip, tp, first_ino); - } + if (delete) + error = xfs_ifree_cluster(ip, tp, first_ino); - return 0; + return error; } /* - * Reallocate the space for if_broot based on the number of records - * being added or deleted as indicated in rec_diff. Move the records - * and pointers in if_broot to fit the new size. When shrinking this - * will eliminate holes between the records and pointers created by - * the caller. When growing this will create holes to be filled in - * by the caller. - * - * The caller must not request to add more records than would fit in - * the on-disk inode root. If the if_broot is currently NULL, then - * if we adding records one will be allocated. The caller must also - * not request that the number of records go below zero, although - * it can go to zero. - * - * ip -- the inode whose if_broot area is changing - * ext_diff -- the change in the number of records, positive or negative, - * requested for the if_broot array. + * This is called to unpin an inode. The caller must have the inode locked + * in at least shared mode so that the buffer cannot be subsequently pinned + * once someone is waiting for it to be unpinned. */ -void -xfs_iroot_realloc( - xfs_inode_t *ip, - int rec_diff, - int whichfork) +static void +xfs_iunpin( + struct xfs_inode *ip) { - int cur_max; - xfs_ifork_t *ifp; - xfs_bmbt_block_t *new_broot; - int new_max; - size_t new_size; - char *np; - char *op; + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); - /* - * Handle the degenerate case quietly. - */ - if (rec_diff == 0) { - return; - } + trace_xfs_inode_unpin_nowait(ip, _RET_IP_); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (rec_diff > 0) { - /* - * If there wasn't any memory allocated before, just - * allocate it now and get out. - */ - if (ifp->if_broot_bytes == 0) { - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff); - ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size, - KM_SLEEP); - ifp->if_broot_bytes = (int)new_size; - return; - } + /* Give the log a push to start the unpinning I/O */ + xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0); - /* - * If there is already an existing if_broot, then we need - * to realloc() it and shift the pointers to their new - * location. The records don't change location because - * they are kept butted up against the btree block header. - */ - cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); - new_max = cur_max + rec_diff; - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); - ifp->if_broot = (xfs_bmbt_block_t *) - kmem_realloc(ifp->if_broot, - new_size, - (size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */ - KM_SLEEP); - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, - (int)new_size); - ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); - memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); - return; - } +} - /* - * rec_diff is less than 0. In this case, we are shrinking the - * if_broot buffer. It must already exist. If we go to zero - * records, just get rid of the root and clear the status bit. - */ - ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); - cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes); - new_max = cur_max + rec_diff; - ASSERT(new_max >= 0); - if (new_max > 0) - new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max); - else - new_size = 0; - if (new_size > 0) { - new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP); - /* - * First copy over the btree block header. - */ - memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t)); - } else { - new_broot = NULL; - ifp->if_flags &= ~XFS_IFBROOT; - } +static void +__xfs_iunpin_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); - /* - * Only copy the records and pointers if there are any. - */ - if (new_max > 0) { - /* - * First copy the records. - */ - op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + xfs_iunpin(ip); - /* - * Then copy the pointers. - */ - op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1, - ifp->if_broot_bytes); - np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1, - (int)new_size); - memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); - } - kmem_free(ifp->if_broot, ifp->if_broot_bytes); - ifp->if_broot = new_broot; - ifp->if_broot_bytes = (int)new_size; - ASSERT(ifp->if_broot_bytes <= - XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ); - return; + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (xfs_ipincount(ip)) + io_schedule(); + } while (xfs_ipincount(ip)); + finish_wait(wq, &wait.wait); } +void +xfs_iunpin_wait( + struct xfs_inode *ip) +{ + if (xfs_ipincount(ip)) + __xfs_iunpin_wait(ip); +} /* - * This is called when the amount of space needed for if_data - * is increased or decreased. The change in size is indicated by - * the number of bytes that need to be added or deleted in the - * byte_diff parameter. + * Removing an inode from the namespace involves removing the directory entry + * and dropping the link count on the inode. Removing the directory entry can + * result in locking an AGF (directory blocks were freed) and removing a link + * count can result in placing the inode on an unlinked list which results in + * locking an AGI. + * + * The big problem here is that we have an ordering constraint on AGF and AGI + * locking - inode allocation locks the AGI, then can allocate a new extent for + * new inodes, locking the AGF after the AGI. Similarly, freeing the inode + * removes the inode from the unlinked list, requiring that we lock the AGI + * first, and then freeing the inode can result in an inode chunk being freed + * and hence freeing disk space requiring that we lock an AGF. * - * If the amount of space needed has decreased below the size of the - * inline buffer, then switch to using the inline buffer. Otherwise, - * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer - * to what is needed. + * Hence the ordering that is imposed by other parts of the code is AGI before + * AGF. This means we cannot remove the directory entry before we drop the inode + * reference count and put it on the unlinked list as this results in a lock + * order of AGF then AGI, and this can deadlock against inode allocation and + * freeing. Therefore we must drop the link counts before we remove the + * directory entry. * - * ip -- the inode whose if_data area is changing - * byte_diff -- the change in the number of bytes, positive or negative, - * requested for the if_data array. + * This is still safe from a transactional point of view - it is not until we + * get to xfs_bmap_finish() that we have the possibility of multiple + * transactions in this operation. Hence as long as we remove the directory + * entry and drop the link count in the first transaction of the remove + * operation, there are no transactional constraints on the ordering here. */ -void -xfs_idata_realloc( - xfs_inode_t *ip, - int byte_diff, - int whichfork) +int +xfs_remove( + xfs_inode_t *dp, + struct xfs_name *name, + xfs_inode_t *ip) { - xfs_ifork_t *ifp; - int new_size; - int real_size; + xfs_mount_t *mp = dp->i_mount; + xfs_trans_t *tp = NULL; + int is_dir = S_ISDIR(ip->i_d.di_mode); + int error = 0; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + int link_zero; + uint resblks; + uint log_count; + + trace_xfs_remove(dp, name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); - if (byte_diff == 0) { - return; - } + error = xfs_qm_dqattach(dp, 0); + if (error) + goto std_return; - ifp = XFS_IFORK_PTR(ip, whichfork); - new_size = (int)ifp->if_bytes + byte_diff; - ASSERT(new_size >= 0); + error = xfs_qm_dqattach(ip, 0); + if (error) + goto std_return; - if (new_size == 0) { - if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); - } - ifp->if_u1.if_data = NULL; - real_size = 0; - } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { - /* - * If the valid extents/data can fit in if_inline_ext/data, - * copy them from the malloc'd vector and free it. - */ - if (ifp->if_u1.if_data == NULL) { - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - ASSERT(ifp->if_real_bytes != 0); - memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, - new_size); - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); - ifp->if_u1.if_data = ifp->if_u2.if_inline_data; - } - real_size = 0; + if (is_dir) { + tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); + log_count = XFS_DEFAULT_LOG_COUNT; } else { - /* - * Stuck with malloc/realloc. - * For inline data, the underlying buffer must be - * a multiple of 4 bytes in size so that it can be - * logged and stay on word boundaries. We enforce - * that here. - */ - real_size = roundup(new_size, 4); - if (ifp->if_u1.if_data == NULL) { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); - } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { - /* - * Only do the realloc if the underlying size - * is really changing. - */ - if (ifp->if_real_bytes != real_size) { - ifp->if_u1.if_data = - kmem_realloc(ifp->if_u1.if_data, - real_size, - ifp->if_real_bytes, - KM_SLEEP); - } - } else { - ASSERT(ifp->if_real_bytes == 0); - ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP); - memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, - ifp->if_bytes); - } + tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); + log_count = XFS_REMOVE_LOG_COUNT; } - ifp->if_real_bytes = real_size; - ifp->if_bytes = new_size; - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); -} - - - - -/* - * Map inode to disk block and offset. - * - * mp -- the mount point structure for the current file system - * tp -- the current transaction - * ino -- the inode number of the inode to be located - * imap -- this structure is filled in with the information necessary - * to retrieve the given inode from disk - * flags -- flags to pass to xfs_dilocate indicating whether or not - * lookups in the inode btree were OK or not - */ -int -xfs_imap( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - xfs_imap_t *imap, - uint flags) -{ - xfs_fsblock_t fsbno; - int len; - int off; - int error; + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - fsbno = imap->im_blkno ? - XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; - error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); - if (error != 0) { - return error; + /* + * We try to get the real space reservation first, + * allowing for directory btree deletion(s) implying + * possible bmap insert(s). If we can't get the space + * reservation then we use 0 instead, and avoid the bmap + * btree insert(s) in the directory code by, if the bmap + * insert tries to happen, instead trimming the LAST + * block from the directory. + */ + resblks = XFS_REMOVE_SPACE_RES(mp); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); + if (error == ENOSPC) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); + } + if (error) { + ASSERT(error != ENOSPC); + cancel_flags = 0; + goto out_trans_cancel; } - imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); - imap->im_len = XFS_FSB_TO_BB(mp, len); - imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); - imap->im_ioffset = (ushort)off; - imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); - return 0; -} -void -xfs_idestroy_fork( - xfs_inode_t *ip, - int whichfork) -{ - xfs_ifork_t *ifp; + xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - ifp = XFS_IFORK_PTR(ip, whichfork); - if (ifp->if_broot != NULL) { - kmem_free(ifp->if_broot, ifp->if_broot_bytes); - ifp->if_broot = NULL; - } + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* - * If the format is local, then we can't have an extents - * array so just look for an inline data array. If we're - * not local then we may or may not have an extents list, - * so check and free it up if we do. + * If we're removing a directory perform some additional validation. */ - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { - if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && - (ifp->if_u1.if_data != NULL)) { - ASSERT(ifp->if_real_bytes != 0); - kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes); - ifp->if_u1.if_data = NULL; - ifp->if_real_bytes = 0; + cancel_flags |= XFS_TRANS_ABORT; + if (is_dir) { + ASSERT(ip->i_d.di_nlink >= 2); + if (ip->i_d.di_nlink != 2) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; + } + if (!xfs_dir_isempty(ip)) { + error = XFS_ERROR(ENOTEMPTY); + goto out_trans_cancel; } - } else if ((ifp->if_flags & XFS_IFEXTENTS) && - ((ifp->if_flags & XFS_IFEXTIREC) || - ((ifp->if_u1.if_extents != NULL) && - (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { - ASSERT(ifp->if_real_bytes != 0); - xfs_iext_destroy(ifp); - } - ASSERT(ifp->if_u1.if_extents == NULL || - ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); - ASSERT(ifp->if_real_bytes == 0); - if (whichfork == XFS_ATTR_FORK) { - kmem_zone_free(xfs_ifork_zone, ip->i_afp); - ip->i_afp = NULL; - } -} -/* - * This is called free all the memory associated with an inode. - * It must free the inode itself and any buffers allocated for - * if_extents/if_data and if_broot. It must also free the lock - * associated with the inode. - */ -void -xfs_idestroy( - xfs_inode_t *ip) -{ + /* Drop the link from ip's "..". */ + error = xfs_droplink(tp, dp); + if (error) + goto out_trans_cancel; - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - mrfree(&ip->i_lock); - mrfree(&ip->i_iolock); - freesema(&ip->i_flock); -#ifdef XFS_BMAP_TRACE - ktrace_free(ip->i_xtrace); -#endif -#ifdef XFS_BMBT_TRACE - ktrace_free(ip->i_btrace); -#endif -#ifdef XFS_RW_TRACE - ktrace_free(ip->i_rwtrace); -#endif -#ifdef XFS_ILOCK_TRACE - ktrace_free(ip->i_lock_trace); -#endif -#ifdef XFS_DIR2_TRACE - ktrace_free(ip->i_dir_trace); -#endif - if (ip->i_itemp) { - /* XXXdpd should be able to assert this but shutdown - * is leaving the AIL behind. */ - ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - xfs_inode_item_destroy(ip); + /* Drop the "." link from ip to self. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; + } else { + /* + * When removing a non-directory we need to log the parent + * inode here. For a directory this is done implicitly + * by the xfs_droplink call for the ".." entry. + */ + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); } - kmem_zone_free(xfs_inode_zone, ip); -} + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + /* Drop the link from dp to ip. */ + error = xfs_droplink(tp, ip); + if (error) + goto out_trans_cancel; -/* - * Increment the pin count of the given buffer. - * This value is protected by ipinlock spinlock in the mount structure. - */ -void -xfs_ipin( - xfs_inode_t *ip) -{ - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + /* Determine if this is the last link while the inode is locked */ + link_zero = (ip->i_d.di_nlink == 0); - atomic_inc(&ip->i_pincount); -} + xfs_bmap_init(&free_list, &first_block); + error = xfs_dir_removename(tp, dp, name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) { + ASSERT(error != ENOENT); + goto out_bmap_cancel; + } -/* - * Decrement the pin count of the given inode, and wake up - * anyone in xfs_iwait_unpin() if the count goes to 0. The - * inode must have been previously pinned with a call to xfs_ipin(). - */ -void -xfs_iunpin( - xfs_inode_t *ip) -{ - ASSERT(atomic_read(&ip->i_pincount) > 0); + /* + * If this is a synchronous mount, make sure that the + * remove transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); - if (atomic_dec_and_test(&ip->i_pincount)) { - /* - * If the inode is currently being reclaimed, the - * linux inode _and_ the xfs vnode may have been - * freed so we cannot reference either of them safely. - * Hence we should not try to do anything to them - * if the xfs inode is currently in the reclaim - * path. - * - * However, we still need to issue the unpin wakeup - * call as the inode reclaim may be blocked waiting for - * the inode to become unpinned. - */ - if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { - bhv_vnode_t *vp = XFS_ITOV_NULL(ip); + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out_bmap_cancel; - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto std_return; - if (!(inode->i_state & - (I_NEW|I_FREEING|I_CLEAR))) - mark_inode_dirty_sync(inode); - } - } - wake_up(&ip->i_ipin_wait); - } + if (is_dir && xfs_inode_is_filestream(ip)) + xfs_filestream_deassociate(ip); + + return 0; + + out_bmap_cancel: + xfs_bmap_cancel(&free_list); + out_trans_cancel: + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; } /* - * This is called to wait for the given inode to be unpinned. - * It will sleep until this happens. The caller must have the - * inode locked in at least shared mode so that the buffer cannot - * be subsequently pinned once someone is waiting for it to be - * unpinned. + * Enter all inodes for a rename transaction into a sorted array. */ STATIC void -xfs_iunpin_wait( - xfs_inode_t *ip) -{ - xfs_inode_log_item_t *iip; - xfs_lsn_t lsn; - - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); - - if (atomic_read(&ip->i_pincount) == 0) { - return; - } - - iip = ip->i_itemp; - if (iip && iip->ili_last_lsn) { - lsn = iip->ili_last_lsn; +xfs_sort_for_rename( + xfs_inode_t *dp1, /* in: old (source) directory inode */ + xfs_inode_t *dp2, /* in: new (target) directory inode */ + xfs_inode_t *ip1, /* in: inode of old entry */ + xfs_inode_t *ip2, /* in: inode of new entry, if it + already exists, NULL otherwise. */ + xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ + int *num_inodes) /* out: number of inodes in array */ +{ + xfs_inode_t *temp; + int i, j; + + /* + * i_tab contains a list of pointers to inodes. We initialize + * the table here & we'll sort it. We will then use it to + * order the acquisition of the inode locks. + * + * Note that the table may contain duplicates. e.g., dp1 == dp2. + */ + i_tab[0] = dp1; + i_tab[1] = dp2; + i_tab[2] = ip1; + if (ip2) { + *num_inodes = 4; + i_tab[3] = ip2; } else { - lsn = (xfs_lsn_t)0; + *num_inodes = 3; + i_tab[3] = NULL; } /* - * Give the log a push so we don't wait here too long. + * Sort the elements via bubble sort. (Remember, there are at + * most 4 elements to sort, so this is adequate.) */ - xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); - - wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { + if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { + temp = i_tab[j]; + i_tab[j] = i_tab[j-1]; + i_tab[j-1] = temp; + } + } + } } - /* - * xfs_iextents_copy() - * - * This is called to copy the REAL extents (as opposed to the delayed - * allocation extents) from the inode into the given buffer. It - * returns the number of bytes copied into the buffer. - * - * If there are no delayed allocation extents, then we can just - * memcpy() the extents into the buffer. Otherwise, we need to - * examine each extent in turn and skip those which are delayed. + * xfs_rename */ int -xfs_iextents_copy( - xfs_inode_t *ip, - xfs_bmbt_rec_t *buffer, - int whichfork) -{ - int copied; - xfs_bmbt_rec_t *dest_ep; - xfs_bmbt_rec_t *ep; -#ifdef XFS_BMAP_TRACE - static char fname[] = "xfs_iextents_copy"; -#endif - int i; - xfs_ifork_t *ifp; - int nrecs; - xfs_fsblock_t start_block; +xfs_rename( + xfs_inode_t *src_dp, + struct xfs_name *src_name, + xfs_inode_t *src_ip, + xfs_inode_t *target_dp, + struct xfs_name *target_name, + xfs_inode_t *target_ip) +{ + xfs_trans_t *tp = NULL; + xfs_mount_t *mp = src_dp->i_mount; + int new_parent; /* moving to a new dir */ + int src_is_directory; /* src_name is a directory */ + int error; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int cancel_flags; + int committed; + xfs_inode_t *inodes[4]; + int spaceres; + int num_inodes; + + trace_xfs_rename(src_dp, target_dp, src_name, target_name); + + new_parent = (src_dp != target_dp); + src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); - ASSERT(ifp->if_bytes > 0); + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork); - ASSERT(nrecs > 0); + xfs_bmap_init(&free_list, &first_block); + tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); + if (error == ENOSPC) { + spaceres = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); + } + if (error) { + xfs_trans_cancel(tp, 0); + goto std_return; + } /* - * There are some delayed allocation extents in the - * inode, so copy the extents one at a time and skip - * the delayed ones. There must be at least one - * non-delayed extent. + * Attach the dquots to the inodes */ - dest_ep = buffer; - copied = 0; - for (i = 0; i < nrecs; i++) { - ep = xfs_iext_get_ext(ifp, i); - start_block = xfs_bmbt_get_startblock(ep); - if (ISNULLSTARTBLOCK(start_block)) { - /* - * It's a delayed allocation extent, so skip it. - */ - continue; - } - - /* Translate to on disk format */ - put_unaligned(INT_GET(ep->l0, ARCH_CONVERT), - (__uint64_t*)&dest_ep->l0); - put_unaligned(INT_GET(ep->l1, ARCH_CONVERT), - (__uint64_t*)&dest_ep->l1); - dest_ep++; - copied++; + error = xfs_qm_vop_rename_dqattach(inodes); + if (error) { + xfs_trans_cancel(tp, cancel_flags); + goto std_return; } - ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, 1, XFS_EXTFMT_INODE(ip)); - return (copied * (uint)sizeof(xfs_bmbt_rec_t)); -} + /* + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); -/* - * Each of the following cases stores data into the same region - * of the on-disk inode, so only one of them can be valid at - * any given time. While it is possible to have conflicting formats - * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is - * in EXTENTS format, this can only happen when the fork has - * changed formats after being modified but before being flushed. - * In these cases, the format always takes precedence, because the - * format indicates the current state of the fork. - */ -/*ARGSUSED*/ -STATIC int -xfs_iflush_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip, - xfs_inode_log_item_t *iip, - int whichfork, - xfs_buf_t *bp) -{ - char *cp; - xfs_ifork_t *ifp; - xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif - static const short brootflag[2] = - { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; - static const short dataflag[2] = - { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; - static const short extflag[2] = - { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; - - if (iip == NULL) - return 0; - ifp = XFS_IFORK_PTR(ip, whichfork); /* - * This can happen if we gave up in iformat in an error path, - * for the attribute fork. + * Join all the inodes to the transaction. From this point on, + * we can rely on either trans_commit or trans_cancel to unlock + * them. */ - if (ifp == NULL) { - ASSERT(whichfork == XFS_ATTR_FORK); - return 0; + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + if (target_ip) + xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { + error = XFS_ERROR(EXDEV); + goto error_return; } - cp = XFS_DFORK_PTR(dip, whichfork); - mp = ip->i_mount; - switch (XFS_IFORK_FORMAT(ip, whichfork)) { - case XFS_DINODE_FMT_LOCAL: - if ((iip->ili_format.ilf_fields & dataflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(ifp->if_u1.if_data != NULL); - ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); - memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); - } - break; - case XFS_DINODE_FMT_EXTENTS: - ASSERT((ifp->if_flags & XFS_IFEXTENTS) || - !(iip->ili_format.ilf_fields & extflag[whichfork])); - ASSERT((xfs_iext_get_ext(ifp, 0) != NULL) || - (ifp->if_bytes == 0)); - ASSERT((xfs_iext_get_ext(ifp, 0) == NULL) || - (ifp->if_bytes > 0)); - if ((iip->ili_format.ilf_fields & extflag[whichfork]) && - (ifp->if_bytes > 0)) { - ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); - (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, - whichfork); - } - break; + /* + * Set up the target. + */ + if (target_ip == NULL) { + /* + * If there's no space reservation, check the entry will + * fit before actually inserting it. + */ + error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); + if (error) + goto error_return; + /* + * If target does not exist and the rename crosses + * directories, adjust the target directory link count + * to account for the ".." reference from the new entry. + */ + error = xfs_dir_createname(tp, target_dp, target_name, + src_ip->i_ino, &first_block, + &free_list, spaceres); + if (error == ENOSPC) + goto error_return; + if (error) + goto abort_return; - case XFS_DINODE_FMT_BTREE: - if ((iip->ili_format.ilf_fields & brootflag[whichfork]) && - (ifp->if_broot_bytes > 0)) { - ASSERT(ifp->if_broot != NULL); - ASSERT(ifp->if_broot_bytes <= - (XFS_IFORK_SIZE(ip, whichfork) + - XFS_BROOT_SIZE_ADJ)); - xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes, - (xfs_bmdr_block_t *)cp, - XFS_DFORK_SIZE(dip, mp, whichfork)); - } - break; + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - case XFS_DINODE_FMT_DEV: - if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { - ASSERT(whichfork == XFS_DATA_FORK); - INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev); + if (new_parent && src_is_directory) { + error = xfs_bumplink(tp, target_dp); + if (error) + goto abort_return; } - break; - - case XFS_DINODE_FMT_UUID: - if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { - ASSERT(whichfork == XFS_DATA_FORK); - memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid, - sizeof(uuid_t)); + } else { /* target_ip != NULL */ + /* + * If target exists and it's a directory, check that both + * target and source are directories and that target can be + * destroyed, or that neither is a directory. + */ + if (S_ISDIR(target_ip->i_d.di_mode)) { + /* + * Make sure target dir is empty. + */ + if (!(xfs_dir_isempty(target_ip)) || + (target_ip->i_d.di_nlink > 2)) { + error = XFS_ERROR(EEXIST); + goto error_return; + } } - break; - - default: - ASSERT(0); - break; - } - - return 0; -} -/* - * xfs_iflush() will write a modified inode's changes out to the - * inode's on disk home. The caller must have the inode lock held - * in at least shared mode and the inode flush semaphore must be - * held as well. The inode lock will still be held upon return from - * the call and the caller is free to unlock it. - * The inode flush lock will be unlocked when the inode reaches the disk. - * The flags indicate how the inode's buffer should be written out. - */ -int -xfs_iflush( - xfs_inode_t *ip, - uint flags) -{ - xfs_inode_log_item_t *iip; - xfs_buf_t *bp; - xfs_dinode_t *dip; - xfs_mount_t *mp; - int error; - /* REFERENCED */ - xfs_chash_t *ch; - xfs_inode_t *iq; - int clcount; /* count of inodes clustered */ - int bufwasdelwri; - enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; - SPLDECL(s); + /* + * Link the source inode under the target name. + * If the source inode is a directory and we are moving + * it across directories, its ".." entry will be + * inconsistent until we replace that down below. + * + * In case there is already an entry with the same + * name at the destination directory, remove it first. + */ + error = xfs_dir_replace(tp, target_dp, target_name, + src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; - XFS_STATS_INC(xs_iflush_count); + xfs_trans_ichgtime(tp, target_dp, + XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); - ASSERT(issemalocked(&(ip->i_flock))); - ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + /* + * Decrement the link count on the target since the target + * dir no longer points to it. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; - iip = ip->i_itemp; - mp = ip->i_mount; + if (src_is_directory) { + /* + * Drop the link from the old "." entry. + */ + error = xfs_droplink(tp, target_ip); + if (error) + goto abort_return; + } + } /* target_ip != NULL */ /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. + * Remove the source. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - ASSERT((iip != NULL) ? - !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); - xfs_ifunlock(ip); - return 0; + if (new_parent && src_is_directory) { + /* + * Rewrite the ".." entry to point to the new + * directory. + */ + error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, + target_dp->i_ino, + &first_block, &free_list, spaceres); + ASSERT(error != EEXIST); + if (error) + goto abort_return; } /* - * We can't flush the inode until it is unpinned, so - * wait for it. We know noone new can pin it, because - * we are holding the inode lock shared and you need - * to hold it exclusively to pin the inode. - */ - xfs_iunpin_wait(ip); - - /* - * This may have been unpinned because the filesystem is shutting - * down forcibly. If that's the case we must not write this inode - * to disk, because the log record didn't make it to disk! + * We always want to hit the ctime on the source inode. + * + * This isn't strictly required by the standards since the source + * inode isn't really being changed, but old unix file systems did + * it and some incremental backup programs won't work without it. */ - if (XFS_FORCED_SHUTDOWN(mp)) { - ip->i_update_core = 0; - if (iip) - iip->ili_format.ilf_fields = 0; - xfs_ifunlock(ip); - return XFS_ERROR(EIO); - } + xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); /* - * Get the buffer containing the on-disk inode. + * Adjust the link count on src_dp. This is necessary when + * renaming a directory, either within one parent when + * the target existed, or across two parent directories. */ - error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); - if (error) { - xfs_ifunlock(ip); - return error; - } + if (src_is_directory && (new_parent || target_ip != NULL)) { - /* - * Decide how buffer will be flushed out. This is done before - * the call to xfs_iflush_int because this field is zeroed by it. - */ - if (iip != NULL && iip->ili_format.ilf_fields != 0) { /* - * Flush out the inode buffer according to the directions - * of the caller. In the cases where the caller has given - * us a choice choose the non-delwri case. This is because - * the inode is in the AIL and we need to get it out soon. + * Decrement link count on src_directory since the + * entry that's moved no longer points to it. */ - switch (flags) { - case XFS_IFLUSH_SYNC: - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - flags = 0; - break; - case XFS_IFLUSH_ASYNC: - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - flags = INT_ASYNC; - break; - case XFS_IFLUSH_DELWRI: - flags = INT_DELWRI; - break; - default: - ASSERT(0); - flags = 0; - break; - } - } else { - switch (flags) { - case XFS_IFLUSH_DELWRI_ELSE_SYNC: - case XFS_IFLUSH_DELWRI_ELSE_ASYNC: - case XFS_IFLUSH_DELWRI: - flags = INT_DELWRI; - break; - case XFS_IFLUSH_ASYNC: - flags = INT_ASYNC; - break; - case XFS_IFLUSH_SYNC: - flags = 0; - break; - default: - ASSERT(0); - flags = 0; - break; - } + error = xfs_droplink(tp, src_dp); + if (error) + goto abort_return; } + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); + if (error) + goto abort_return; + + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + /* - * First flush out the inode that xfs_iflush was called with. + * If this is a synchronous mount, make sure that the + * rename transaction goes to disk before returning to + * the user. */ - error = xfs_iflush_int(ip, bp); + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { - goto corrupt_out; + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT)); + goto std_return; } /* - * inode clustering: - * see if other inodes can be gathered into this write + * trans_commit will unlock src_ip, target_ip & decrement + * the vnode references. */ + return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - ip->i_chash->chl_buf = bp; + abort_return: + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_bmap_cancel(&free_list); + xfs_trans_cancel(tp, cancel_flags); + std_return: + return error; +} - ch = XFS_CHASH(mp, ip->i_blkno); - s = mutex_spinlock(&ch->ch_lock); +STATIC int +xfs_iflush_cluster( + xfs_inode_t *ip, + xfs_buf_t *bp) +{ + xfs_mount_t *mp = ip->i_mount; + struct xfs_perag *pag; + unsigned long first_index, mask; + unsigned long inodes_per_cluster; + int ilist_size; + xfs_inode_t **ilist; + xfs_inode_t *iq; + int nr_found; + int clcount = 0; + int bufwasdelwri; + int i; + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + + inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); + if (!ilist) + goto out_put; + + mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; + rcu_read_lock(); + /* really need a gang lookup range call here */ + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + first_index, inodes_per_cluster); + if (nr_found == 0) + goto out_free; + + for (i = 0; i < nr_found; i++) { + iq = ilist[i]; + if (iq == ip) + continue; + + /* + * because this is an RCU protected lookup, we could find a + * recently freed or even reallocated inode during the lookup. + * We need to check under the i_flags_lock for a valid inode + * here. Skip it if it is not valid or the wrong inode. + */ + spin_lock(&ip->i_flags_lock); + if (!ip->i_ino || + (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&ip->i_flags_lock); + continue; + } + spin_unlock(&ip->i_flags_lock); - clcount = 0; - for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) { /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - iip = iq->i_itemp; - if ((iq->i_update_core == 0) && - ((iip == NULL) || - !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) && - xfs_ipincount(iq) == 0) { + if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) continue; - } /* - * Try to get locks. If any are unavailable, + * Try to get locks. If any are unavailable or it is pinned, * then this inode cannot be flushed and is skipped. */ - /* get inode locks (just i_lock) */ - if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) { - /* get inode flush lock */ - if (xfs_iflock_nowait(iq)) { - /* check if pinned */ - if (xfs_ipincount(iq) == 0) { - /* arriving here means that - * this inode can be flushed. - * first re-check that it's - * dirty - */ - iip = iq->i_itemp; - if ((iq->i_update_core != 0)|| - ((iip != NULL) && - (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { - clcount++; - error = xfs_iflush_int(iq, bp); - if (error) { - xfs_iunlock(iq, - XFS_ILOCK_SHARED); - goto cluster_corrupt_out; - } - } else { - xfs_ifunlock(iq); - } - } else { - xfs_ifunlock(iq); - } - } + if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + continue; + if (!xfs_iflock_nowait(iq)) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + if (xfs_ipincount(iq)) { + xfs_ifunlock(iq); xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + + /* + * arriving here means that this inode can be flushed. First + * re-check that it's dirty before flushing. + */ + if (!xfs_inode_clean(iq)) { + int error; + error = xfs_iflush_int(iq, bp); + if (error) { + xfs_iunlock(iq, XFS_ILOCK_SHARED); + goto cluster_corrupt_out; + } + clcount++; + } else { + xfs_ifunlock(iq); } + xfs_iunlock(iq, XFS_ILOCK_SHARED); } - mutex_spinunlock(&ch->ch_lock, s); if (clcount) { XFS_STATS_INC(xs_icluster_flushcnt); XFS_STATS_ADD(xs_icluster_flushinode, clcount); } - /* - * If the buffer is pinned then push on the log so we won't - * get stuck waiting in the write for too long. - */ - if (XFS_BUF_ISPINNED(bp)){ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - } - - if (flags & INT_DELWRI) { - xfs_bdwrite(mp, bp); - } else if (flags & INT_ASYNC) { - xfs_bawrite(mp, bp); - } else { - error = xfs_bwrite(mp, bp); - } - return error; +out_free: + rcu_read_unlock(); + kmem_free(ilist); +out_put: + xfs_perag_put(pag); + return 0; -corrupt_out: - xfs_buf_relse(bp); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - xfs_iflush_abort(ip); - /* - * Unlocks the flush lock - */ - return XFS_ERROR(EFSCORRUPTED); cluster_corrupt_out: - /* Corruption detected in the clustering loop. Invalidate the + /* + * Corruption detected in the clustering loop. Invalidate the * inode buffer and shut down the filesystem. */ - mutex_spinunlock(&ch->ch_lock, s); - + rcu_read_unlock(); /* - * Clean up the buffer. If it was B_DELWRI, just release it -- + * Clean up the buffer. If it was delwri, just release it -- * brelse can handle it with no problems. If not, shut down the * filesystem before releasing the buffer. */ - if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) { + bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q); + if (bufwasdelwri) xfs_buf_relse(bp); - } xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if(!bufwasdelwri) { + if (!bufwasdelwri) { /* * Just like incore_relse: if we have b_iodone functions, * mark the buffer as an error and call them. Otherwise * mark it as stale and brelse. */ - if (XFS_BUF_IODONE_FUNC(bp)) { - XFS_BUF_CLR_BDSTRAT_FUNC(bp); + if (bp->b_iodone) { XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_SHUT(bp); - XFS_BUF_ERROR(bp,EIO); - xfs_biodone(bp); + xfs_buf_stale(bp); + xfs_buf_ioerror(bp, EIO); + xfs_buf_ioend(bp, 0); } else { - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); } } - xfs_iflush_abort(iq); /* * Unlocks the flush lock */ + xfs_iflush_abort(iq, false); + kmem_free(ilist); + xfs_perag_put(pag); return XFS_ERROR(EFSCORRUPTED); } - -STATIC int -xfs_iflush_int( - xfs_inode_t *ip, - xfs_buf_t *bp) +/* + * Flush dirty inode metadata into the backing buffer. + * + * The caller must have the inode lock and the inode flush lock held. The + * inode lock will still be held upon return to the caller, and the inode + * flush lock will be released after the inode has reached the disk. + * + * The caller must write out the buffer returned in *bpp and release it. + */ +int +xfs_iflush( + struct xfs_inode *ip, + struct xfs_buf **bpp) { - xfs_inode_log_item_t *iip; - xfs_dinode_t *dip; - xfs_mount_t *mp; -#ifdef XFS_TRANS_DEBUG - int first; -#endif - SPLDECL(s); + struct xfs_mount *mp = ip->i_mount; + struct xfs_buf *bp; + struct xfs_dinode *dip; + int error; + + XFS_STATS_INC(xs_iflush_count); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); - ASSERT(issemalocked(&(ip->i_flock))); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || - ip->i_d.di_nextents > ip->i_df.if_ext_max); + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); - iip = ip->i_itemp; - mp = ip->i_mount; + *bpp = NULL; + xfs_iunpin_wait(ip); /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_imap_to_bp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) { + if (xfs_iflags_test(ip, XFS_ISTALE)) { xfs_ifunlock(ip); return 0; } - /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset); + /* + * This may have been unpinned because the filesystem is shutting + * down forcibly. If that's the case we must not write this inode + * to disk, because the log record didn't make it to disk. + * + * We also have to remove the log item from the AIL in this case, + * as we wait for an empty AIL as part of the unmount process. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto abort_out; + } /* - * Clear i_update_core before copying out the data. - * This is for coordination with our timestamp updates - * that don't hold the inode lock. They will always - * update the timestamps BEFORE setting i_update_core, - * so if we clear i_update_core after they set it we - * are guaranteed to see their updates to the timestamps. - * I believe that this depends on strongly ordered memory - * semantics, but we have that. We use the SYNCHRONIZE - * macro to make sure that the compiler does not reorder - * the i_update_core access below the data copy below. + * Get the buffer containing the on-disk inode. */ - ip->i_update_core = 0; - SYNCHRONIZE(); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, + 0); + if (error || !bp) { + xfs_ifunlock(ip); + return error; + } /* - * Make sure to get the latest atime from the Linux inode. + * First flush out the inode that xfs_iflush was called with. */ - xfs_synchronize_atime(ip); + error = xfs_iflush_int(ip, bp); + if (error) + goto corrupt_out; - if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC, + /* + * If the buffer is pinned then push on the log now so we won't + * get stuck waiting in the write for too long. + */ + if (xfs_buf_ispinned(bp)) + xfs_log_force(mp, 0); + + /* + * inode clustering: + * see if other inodes can be gathered into this write + */ + error = xfs_iflush_cluster(ip, bp); + if (error) + goto cluster_corrupt_out; + + *bpp = bp; + return 0; + +corrupt_out: + xfs_buf_relse(bp); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +cluster_corrupt_out: + error = XFS_ERROR(EFSCORRUPTED); +abort_out: + /* + * Unlocks the flush lock + */ + xfs_iflush_abort(ip, false); + return error; +} + +STATIC int +xfs_iflush_int( + struct xfs_inode *ip, + struct xfs_buf *bp) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); + ASSERT(iip != NULL && iip->ili_fields != 0); + ASSERT(ip->i_d.di_version > 1); + + /* set *dip = inode's place in the buffer */ + dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + + if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { - xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, - "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", - ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip); + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p", + __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); goto corrupt_out; } if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { - xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, - "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", - ip->i_ino, ip, ip->i_d.di_magic); + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x", + __func__, ip->i_ino, ip, ip->i_d.di_magic); goto corrupt_out; } - if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { + if (S_ISREG(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { - xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, - "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", - ip->i_ino, ip); + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad regular inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); goto corrupt_out; } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + } else if (S_ISDIR(ip->i_d.di_mode)) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { - xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, - "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", - ip->i_ino, ip); + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: Bad directory inode %Lu, ptr 0x%p", + __func__, ip->i_ino, ip); goto corrupt_out; } } if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, XFS_RANDOM_IFLUSH_5)) { - xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, - "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", - ip->i_ino, + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: detected corrupt incore inode %Lu, " + "total extents = %d, nblocks = %Ld, ptr 0x%p", + __func__, ip->i_ino, ip->i_d.di_nextents + ip->i_d.di_anextents, - ip->i_d.di_nblocks, - ip); + ip->i_d.di_nblocks, ip); goto corrupt_out; } if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { - xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, - "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", - ip->i_ino, ip->i_d.di_forkoff, ip); + xfs_alert_tag(mp, XFS_PTAG_IFLUSH, + "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p", + __func__, ip->i_ino, ip->i_d.di_forkoff, ip); goto corrupt_out; } + /* - * bump the flush iteration count, used to detect flushes which - * postdate a log record during recovery. + * Inode item log recovery for v2 inodes are dependent on the + * di_flushiter count for correct sequencing. We bump the flush + * iteration count so we can detect flushes which postdate a log record + * during recovery. This is redundant as we now log every change and + * hence this can't happen but we need to still do it to ensure + * backwards compatibility with old kernels that predate logging all + * inode changes. */ - - ip->i_d.di_flushiter++; + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; /* * Copy the dirty parts of the inode into the on-disk @@ -3375,1367 +3268,68 @@ xfs_iflush_int( * because if the inode is dirty at all the core must * be. */ - xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1); + xfs_dinode_to_disk(dip, &ip->i_d); /* Wrap, we never let the log put out DI_MAX_FLUSH */ if (ip->i_d.di_flushiter == DI_MAX_FLUSH) ip->i_d.di_flushiter = 0; - /* - * If this is really an old format inode and the superblock version - * has not been updated to support only new format inodes, then - * convert back to the old inode format. If the superblock version - * has been updated, then make the conversion permanent. - */ - ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || - XFS_SB_VERSION_HASNLINK(&mp->m_sb)); - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink); - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = XFS_DINODE_VERSION_2; - INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2); - ip->i_d.di_onlink = 0; - dip->di_core.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - memset(&(dip->di_core.di_pad[0]), 0, - sizeof(dip->di_core.di_pad)); - ASSERT(ip->i_d.di_projid == 0); - } - } - - if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { - goto corrupt_out; - } - - if (XFS_IFORK_Q(ip)) { - /* - * The only error from xfs_iflush_fork is on the data fork. - */ - (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp); - } + xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); + if (XFS_IFORK_Q(ip)) + xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); xfs_inobp_check(mp, bp); /* - * We've recorded everything logged in the inode, so we'd - * like to clear the ilf_fields bits so we don't log and - * flush things unnecessarily. However, we can't stop - * logging all this information until the data we've copied - * into the disk buffer is written to disk. If we did we might - * overwrite the copy of the inode in the log with all the - * data after re-logging only part of it, and in the face of - * a crash we wouldn't have all the data we need to recover. + * We've recorded everything logged in the inode, so we'd like to clear + * the ili_fields bits so we don't log and flush things unnecessarily. + * However, we can't stop logging all this information until the data + * we've copied into the disk buffer is written to disk. If we did we + * might overwrite the copy of the inode in the log with all the data + * after re-logging only part of it, and in the face of a crash we + * wouldn't have all the data we need to recover. * - * What we do is move the bits to the ili_last_fields field. - * When logging the inode, these bits are moved back to the - * ilf_fields field. In the xfs_iflush_done() routine we - * clear ili_last_fields, since we know that the information - * those bits represent is permanently on disk. As long as - * the flush completes before the inode is logged again, then - * both ilf_fields and ili_last_fields will be cleared. + * What we do is move the bits to the ili_last_fields field. When + * logging the inode, these bits are moved back to the ili_fields field. + * In the xfs_iflush_done() routine we clear ili_last_fields, since we + * know that the information those bits represent is permanently on + * disk. As long as the flush completes before the inode is logged + * again, then both ili_fields and ili_last_fields will be cleared. * - * We can play with the ilf_fields bits here, because the inode - * lock must be held exclusively in order to set bits there - * and the flush lock protects the ili_last_fields bits. - * Set ili_logged so the flush done - * routine can tell whether or not to look in the AIL. - * Also, store the current LSN of the inode so that we can tell - * whether the item has moved in the AIL from xfs_iflush_done(). - * In order to read the lsn we need the AIL lock, because - * it is a 64 bit value that cannot be read atomically. - */ - if (iip != NULL && iip->ili_format.ilf_fields != 0) { - iip->ili_last_fields = iip->ili_format.ilf_fields; - iip->ili_format.ilf_fields = 0; - iip->ili_logged = 1; - - ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ - AIL_LOCK(mp,s); - iip->ili_flush_lsn = iip->ili_item.li_lsn; - AIL_UNLOCK(mp, s); - - /* - * Attach the function xfs_iflush_done to the inode's - * buffer. This will remove the inode from the AIL - * and unlock the inode's flush lock when the inode is - * completely written to disk. - */ - xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_iflush_done, (xfs_log_item_t *)iip); - - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL); - } else { - /* - * We're flushing an inode which is not in the AIL and has - * not been logged but has i_update_core set. For this - * case we can use a B_DELWRI flush and immediately drop - * the inode flush lock because we can avoid the whole - * AIL state thing. It's OK to drop the flush lock now, - * because we've already locked the buffer and to do anything - * you really need both. - */ - if (iip != NULL) { - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_last_fields == 0); - ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0); - } - xfs_ifunlock(ip); - } - - return 0; - -corrupt_out: - return XFS_ERROR(EFSCORRUPTED); -} - - -/* - * Flush all inactive inodes in mp. - */ -void -xfs_iflush_all( - xfs_mount_t *mp) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - - again: - XFS_MOUNT_ILOCK(mp); - ip = mp->m_inodes; - if (ip == NULL) - goto out; - - do { - /* Make sure we skip markers inserted by sync */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; - } - - vp = XFS_ITOV_NULL(ip); - if (!vp) { - XFS_MOUNT_IUNLOCK(mp); - xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC); - goto again; - } - - ASSERT(vn_count(vp) == 0); - - ip = ip->i_mnext; - } while (ip != mp->m_inodes); - out: - XFS_MOUNT_IUNLOCK(mp); -} - -/* - * xfs_iaccess: check accessibility of inode for mode. - */ -int -xfs_iaccess( - xfs_inode_t *ip, - mode_t mode, - cred_t *cr) -{ - int error; - mode_t orgmode = mode; - struct inode *inode = vn_to_inode(XFS_ITOV(ip)); - - if (mode & S_IWUSR) { - umode_t imode = inode->i_mode; - - if (IS_RDONLY(inode) && - (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode))) - return XFS_ERROR(EROFS); - - if (IS_IMMUTABLE(inode)) - return XFS_ERROR(EACCES); - } - - /* - * If there's an Access Control List it's used instead of - * the mode bits. - */ - if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1) - return error ? XFS_ERROR(error) : 0; - - if (current_fsuid(cr) != ip->i_d.di_uid) { - mode >>= 3; - if (!in_group_p((gid_t)ip->i_d.di_gid)) - mode >>= 3; - } - - /* - * If the DACs are ok we don't need any capability check. - */ - if ((ip->i_d.di_mode & mode) == mode) - return 0; - /* - * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. - */ - if (!(orgmode & S_IXUSR) || - (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) - if (capable_cred(cr, CAP_DAC_OVERRIDE)) - return 0; - - if ((orgmode == S_IRUSR) || - (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) { - if (capable_cred(cr, CAP_DAC_READ_SEARCH)) - return 0; -#ifdef NOISE - cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode); -#endif /* NOISE */ - return XFS_ERROR(EACCES); - } - return XFS_ERROR(EACCES); -} - -/* - * xfs_iroundup: round up argument to next power of two - */ -uint -xfs_iroundup( - uint v) -{ - int i; - uint m; - - if ((v & (v - 1)) == 0) - return v; - ASSERT((v & 0x80000000) == 0); - if ((v & (v + 1)) == 0) - return v + 1; - for (i = 0, m = 1; i < 31; i++, m <<= 1) { - if (v & m) - continue; - v |= m; - if ((v & (v + 1)) == 0) - return v + 1; - } - ASSERT(0); - return( 0 ); -} - -#ifdef XFS_ILOCK_TRACE -ktrace_t *xfs_ilock_trace_buf; - -void -xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra) -{ - ktrace_enter(ip->i_lock_trace, - (void *)ip, - (void *)(unsigned long)lock, /* 1 = LOCK, 3=UNLOCK, etc */ - (void *)(unsigned long)lockflags, /* XFS_ILOCK_EXCL etc */ - (void *)ra, /* caller of ilock */ - (void *)(unsigned long)current_cpu(), - (void *)(unsigned long)current_pid(), - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL); -} -#endif - -/* - * Return a pointer to the extent record at file index idx. - */ -xfs_bmbt_rec_t * -xfs_iext_get_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx) /* index of target extent */ -{ - ASSERT(idx >= 0); - if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { - return ifp->if_u1.if_ext_irec->er_extbuf; - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_ext_irec_t *erp; /* irec pointer */ - int erp_idx = 0; /* irec index */ - xfs_extnum_t page_idx = idx; /* ext index in target list */ - - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - return &erp->er_extbuf[page_idx]; - } else if (ifp->if_bytes) { - return &ifp->if_u1.if_extents[idx]; - } else { - return NULL; - } -} - -/* - * Insert new item(s) into the extent records for incore inode - * fork 'ifp'. 'count' new items are inserted at index 'idx'. - */ -void -xfs_iext_insert( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* starting index of new items */ - xfs_extnum_t count, /* number of inserted items */ - xfs_bmbt_irec_t *new) /* items to insert */ -{ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - xfs_extnum_t i; /* extent record index */ - - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - xfs_iext_add(ifp, idx, count); - for (i = idx; i < idx + count; i++, new++) { - ep = xfs_iext_get_ext(ifp, i); - xfs_bmbt_set_all(ep, new); - } -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be increased. The ext_diff parameter stores the - * number of new extents being added and the idx parameter contains - * the extent index where the new extents will be added. If the new - * extents are being appended, then we just need to (re)allocate and - * initialize the space. Otherwise, if the new extents are being - * inserted into the middle of the existing entries, a bit more work - * is required to make room for the new extents to be inserted. The - * caller is responsible for filling in the new extent entries upon - * return. - */ -void -xfs_iext_add( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* number of extents to add */ -{ - int byte_diff; /* new bytes being added */ - int new_size; /* size of extents after adding */ - xfs_extnum_t nextents; /* number of extents in file */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT((idx >= 0) && (idx <= nextents)); - byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); - new_size = ifp->if_bytes + byte_diff; - /* - * If the new number of extents (nextents + ext_diff) - * fits inside the inode, then continue to use the inline - * extent buffer. - */ - if (nextents + ext_diff <= XFS_INLINE_EXTS) { - if (idx < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], - &ifp->if_u2.if_inline_ext[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); - } - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; - ifp->if_lastex = nextents + ext_diff; - } - /* - * Otherwise use a linear (direct) extent list. - * If the extents are currently inside the inode, - * xfs_iext_realloc_direct will switch us from - * inline to direct extent allocation mode. - */ - else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, new_size); - if (idx < nextents) { - memmove(&ifp->if_u1.if_extents[idx + ext_diff], - &ifp->if_u1.if_extents[idx], - (nextents - idx) * sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); - } - } - /* Indirection array */ - else { - xfs_ext_irec_t *erp; - int erp_idx = 0; - int page_idx = idx; - - ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); - if (ifp->if_flags & XFS_IFEXTIREC) { - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); - } else { - xfs_iext_irec_init(ifp); - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = ifp->if_u1.if_ext_irec; - } - /* Extents fit in target extent page */ - if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { - if (page_idx < erp->er_extcount) { - memmove(&erp->er_extbuf[page_idx + ext_diff], - &erp->er_extbuf[page_idx], - (erp->er_extcount - page_idx) * - sizeof(xfs_bmbt_rec_t)); - memset(&erp->er_extbuf[page_idx], 0, byte_diff); - } - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - } - /* Insert a new extent page */ - else if (erp) { - xfs_iext_add_indirect_multi(ifp, - erp_idx, page_idx, ext_diff); - } - /* - * If extent(s) are being appended to the last page in - * the indirection array and the new extent(s) don't fit - * in the page, then erp is NULL and erp_idx is set to - * the next index needed in the indirection array. - */ - else { - int count = ext_diff; - - while (count) { - erp = xfs_iext_irec_new(ifp, erp_idx); - erp->er_extcount = count; - count -= MIN(count, (int)XFS_LINEAR_EXTS); - if (count) { - erp_idx++; - } - } - } - } - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being added to the indirection - * array and the new extents do not fit in the target extent list. The - * erp_idx parameter contains the irec index for the target extent list - * in the indirection array, and the idx parameter contains the extent - * index within the list. The number of extents being added is stored - * in the count parameter. - * - * |-------| |-------| - * | | | | idx - number of extents before idx - * | idx | | count | - * | | | | count - number of extents being inserted at idx - * |-------| |-------| - * | count | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_add_indirect_multi( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* target extent irec index */ - xfs_extnum_t idx, /* index within target list */ - int count) /* new extents being added */ -{ - int byte_diff; /* new bytes being added */ - xfs_ext_irec_t *erp; /* pointer to irec entry */ - xfs_extnum_t ext_diff; /* number of extents to add */ - xfs_extnum_t ext_cnt; /* new extents still needed */ - xfs_extnum_t nex2; /* extents after idx + count */ - xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ - int nlists; /* number of irec's (lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex2 = erp->er_extcount - idx; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * Save second part of target extent list - * (all extents past */ - if (nex2) { - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP); - memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); - erp->er_extcount -= nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); - memset(&erp->er_extbuf[idx], 0, byte_diff); - } - - /* - * Add the new extents to the end of the target - * list, then allocate new irec record(s) and - * extent buffer(s) as needed to store the rest - * of the new extents. - */ - ext_cnt = count; - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); - if (ext_diff) { - erp->er_extcount += ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - while (ext_cnt) { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); - erp->er_extcount = ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); - ext_cnt -= ext_diff; - } - - /* Add nex2 extents back to indirection array */ - if (nex2) { - xfs_extnum_t ext_avail; - int i; - - byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - i = 0; - /* - * If nex2 extents fit in the current page, append - * nex2_ep after the new extents. - */ - if (nex2 <= ext_avail) { - i = erp->er_extcount; - } - /* - * Otherwise, check if space is available in the - * next page. - */ - else if ((erp_idx < nlists - 1) && - (nex2 <= (ext_avail = XFS_LINEAR_EXTS - - ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { - erp_idx++; - erp++; - /* Create a hole for nex2 extents */ - memmove(&erp->er_extbuf[nex2], erp->er_extbuf, - erp->er_extcount * sizeof(xfs_bmbt_rec_t)); - } - /* - * Final choice, create a new extent page for - * nex2 extents. - */ - else { - erp_idx++; - erp = xfs_iext_irec_new(ifp, erp_idx); - } - memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); - kmem_free(nex2_ep, byte_diff); - erp->er_extcount += nex2; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); - } -} - -/* - * This is called when the amount of space required for incore file - * extents needs to be decreased. The ext_diff parameter stores the - * number of extents to be removed and the idx parameter contains - * the extent index where the extents will be removed from. - * - * If the amount of space needed has decreased below the linear - * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous - * extent array. Otherwise, use kmem_realloc() to adjust the - * size to what is needed. - */ -void -xfs_iext_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else if (ifp->if_flags & XFS_IFEXTIREC) { - xfs_iext_remove_indirect(ifp, idx, ext_diff); - } else if (ifp->if_real_bytes) { - xfs_iext_remove_direct(ifp, idx, ext_diff); - } else { - xfs_iext_remove_inline(ifp, idx, ext_diff); - } - ifp->if_bytes = new_size; -} - -/* - * This removes ext_diff extents from the inline buffer, beginning - * at extent index idx. - */ -void -xfs_iext_remove_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - int nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(((nextents - ext_diff) > 0) && - (nextents - ext_diff) < XFS_INLINE_EXTS); - - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u2.if_inline_ext[idx], - &ifp->if_u2.if_inline_ext[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - } else { - memset(&ifp->if_u2.if_inline_ext[idx], 0, - ext_diff * sizeof(xfs_bmbt_rec_t)); - } -} - -/* - * This removes ext_diff extents from a linear (direct) extent list, - * beginning at extent index idx. If the extents are being removed - * from the end of the list (ie. truncate) then we just need to re- - * allocate the list to remove the extra space. Otherwise, if the - * extents are being removed from the middle of the existing extent - * entries, then we first need to move the extent records beginning - * at idx + ext_diff up in the list to overwrite the records being - * removed, then remove the extra space via kmem_realloc. - */ -void -xfs_iext_remove_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing exts */ - int ext_diff) /* number of extents to remove */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int new_size; /* size of extents after removal */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - new_size = ifp->if_bytes - - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (new_size == 0) { - xfs_iext_destroy(ifp); - return; - } - /* Move extents up in the list (if needed) */ - if (idx + ext_diff < nextents) { - memmove(&ifp->if_u1.if_extents[idx], - &ifp->if_u1.if_extents[idx + ext_diff], - (nextents - (idx + ext_diff)) * - sizeof(xfs_bmbt_rec_t)); - } - memset(&ifp->if_u1.if_extents[nextents - ext_diff], - 0, ext_diff * sizeof(xfs_bmbt_rec_t)); - /* - * Reallocate the direct extent list. If the extents - * will fit inside the inode then xfs_iext_realloc_direct - * will switch from direct to inline extent allocation - * mode for us. + * We can play with the ili_fields bits here, because the inode lock + * must be held exclusively in order to set bits there and the flush + * lock protects the ili_last_fields bits. Set ili_logged so the flush + * done routine can tell whether or not to look in the AIL. Also, store + * the current LSN of the inode so that we can tell whether the item has + * moved in the AIL from xfs_iflush_done(). In order to read the lsn we + * need the AIL lock, because it is a 64 bit value that cannot be read + * atomically. */ - xfs_iext_realloc_direct(ifp, new_size); - ifp->if_bytes = new_size; -} - -/* - * This is called when incore extents are being removed from the - * indirection array and the extents being removed span multiple extent - * buffers. The idx parameter contains the file extent index where we - * want to begin removing extents, and the count parameter contains - * how many extents need to be removed. - * - * |-------| |-------| - * | nex1 | | | nex1 - number of extents before idx - * |-------| | count | - * | | | | count - number of extents being removed at idx - * | count | |-------| - * | | | nex2 | nex2 - number of extents after idx + count - * |-------| |-------| - */ -void -xfs_iext_remove_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t idx, /* index to begin removing extents */ - int count) /* number of extents to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int erp_idx = 0; /* indirection array index */ - xfs_extnum_t ext_cnt; /* extents left to remove */ - xfs_extnum_t ext_diff; /* extents to remove in current list */ - xfs_extnum_t nex1; /* number of extents before idx */ - xfs_extnum_t nex2; /* extents after idx + count */ - int nlists; /* entries in indirection array */ - int page_idx = idx; /* index in target extent list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); - ASSERT(erp != NULL); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nex1 = page_idx; - ext_cnt = count; - while (ext_cnt) { - nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); - ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); - /* - * Check for deletion of entire list; - * xfs_iext_irec_remove() updates extent offsets. - */ - if (ext_diff == erp->er_extcount) { - xfs_iext_irec_remove(ifp, erp_idx); - ext_cnt -= ext_diff; - nex1 = 0; - if (ext_cnt) { - ASSERT(erp_idx < ifp->if_real_bytes / - XFS_IEXT_BUFSZ); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nex1 = 0; - continue; - } else { - break; - } - } - /* Move extents up (if needed) */ - if (nex2) { - memmove(&erp->er_extbuf[nex1], - &erp->er_extbuf[nex1 + ext_diff], - nex2 * sizeof(xfs_bmbt_rec_t)); - } - /* Zero out rest of page */ - memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - - ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); - /* Update remaining counters */ - erp->er_extcount -= ext_diff; - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); - ext_cnt -= ext_diff; - nex1 = 0; - erp_idx++; - erp++; - } - ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact(ifp); -} - -/* - * Create, destroy, or resize a linear (direct) block of extents. - */ -void -xfs_iext_realloc_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new size of extents */ -{ - int rnew_size; /* real new size of extents */ - - rnew_size = new_size; + iip->ili_last_fields = iip->ili_fields; + iip->ili_fields = 0; + iip->ili_logged = 1; - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || - ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && - (new_size != ifp->if_real_bytes))); - - /* Free extent records */ - if (new_size == 0) { - xfs_iext_destroy(ifp); - } - /* Resize direct extent list and zero any new bytes */ - else if (ifp->if_real_bytes) { - /* Check if extents will fit inside the inode */ - if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { - xfs_iext_direct_to_inline(ifp, new_size / - (uint)sizeof(xfs_bmbt_rec_t)); - ifp->if_bytes = new_size; - return; - } - if ((new_size & (new_size - 1)) != 0) { - rnew_size = xfs_iroundup(new_size); - } - if (rnew_size != ifp->if_real_bytes) { - ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) - kmem_realloc(ifp->if_u1.if_extents, - rnew_size, - ifp->if_real_bytes, - KM_SLEEP); - } - if (rnew_size > ifp->if_real_bytes) { - memset(&ifp->if_u1.if_extents[ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t)], 0, - rnew_size - ifp->if_real_bytes); - } - } - /* - * Switch from the inline extent buffer to a direct - * extent list. Be sure to include the inline extent - * bytes in new_size. - */ - else { - new_size += ifp->if_bytes; - if ((new_size & (new_size - 1)) != 0) { - rnew_size = xfs_iroundup(new_size); - } - xfs_iext_inline_to_direct(ifp, rnew_size); - } - ifp->if_real_bytes = rnew_size; - ifp->if_bytes = new_size; -} + xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, + &iip->ili_item.li_lsn); -/* - * Switch from linear (direct) extent records to inline buffer. - */ -void -xfs_iext_direct_to_inline( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t nextents) /* number of extents in file */ -{ - ASSERT(ifp->if_flags & XFS_IFEXTENTS); - ASSERT(nextents <= XFS_INLINE_EXTS); /* - * The inline buffer was zeroed when we switched - * from inline to direct extent allocation mode, - * so we don't need to clear it here. + * Attach the function xfs_iflush_done to the inode's + * buffer. This will remove the inode from the AIL + * and unlock the inode's flush lock when the inode is + * completely written to disk. */ - memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, - nextents * sizeof(xfs_bmbt_rec_t)); - kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); - ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; - ifp->if_real_bytes = 0; -} + xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item); -/* - * Switch from inline buffer to linear (direct) extent records. - * new_size should already be rounded up to the next power of 2 - * by the caller (when appropriate), so use new_size as it is. - * However, since new_size may be rounded up, we can't update - * if_bytes here. It is the caller's responsibility to update - * if_bytes upon return. - */ -void -xfs_iext_inline_to_direct( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* number of extents in file */ -{ - ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) - kmem_alloc(new_size, KM_SLEEP); - memset(ifp->if_u1.if_extents, 0, new_size); - if (ifp->if_bytes) { - memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, - ifp->if_bytes); - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_real_bytes = new_size; -} + /* update the lsn in the on disk inode if required */ + if (ip->i_d.di_version == 3) + dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn); -/* - * Resize an extent indirection array to new_size bytes. - */ -void -xfs_iext_realloc_indirect( - xfs_ifork_t *ifp, /* inode fork pointer */ - int new_size) /* new indirection array size */ -{ - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); - ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); - if (new_size == 0) { - xfs_iext_destroy(ifp); - } else { - ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) - kmem_realloc(ifp->if_u1.if_ext_irec, - new_size, size, KM_SLEEP); - } -} - -/* - * Switch from indirection array to linear (direct) extent allocations. - */ -void -xfs_iext_indirect_to_direct( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_t *ep; /* extent record pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - int size; /* size of file extents */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - size = nextents * sizeof(xfs_bmbt_rec_t); - - xfs_iext_irec_compact_full(ifp); - ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); - - ep = ifp->if_u1.if_ext_irec->er_extbuf; - kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t)); - ifp->if_flags &= ~XFS_IFEXTIREC; - ifp->if_u1.if_extents = ep; - ifp->if_bytes = size; - if (nextents < XFS_LINEAR_EXTS) { - xfs_iext_realloc_direct(ifp, size); - } -} - -/* - * Free incore file extents. - */ -void -xfs_iext_destroy( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - if (ifp->if_flags & XFS_IFEXTIREC) { - int erp_idx; - int nlists; - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { - xfs_iext_irec_remove(ifp, erp_idx); - } - ifp->if_flags &= ~XFS_IFEXTIREC; - } else if (ifp->if_real_bytes) { - kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes); - } else if (ifp->if_bytes) { - memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * - sizeof(xfs_bmbt_rec_t)); - } - ifp->if_u1.if_extents = NULL; - ifp->if_real_bytes = 0; - ifp->if_bytes = 0; -} + /* generate the checksum. */ + xfs_dinode_calc_crc(mp, dip); -/* - * Return a pointer to the extent record for file system block bno. - */ -xfs_bmbt_rec_t * /* pointer to found extent record */ -xfs_iext_bno_to_ext( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - xfs_extnum_t *idxp) /* index of target extent */ -{ - xfs_bmbt_rec_t *base; /* pointer to first extent */ - xfs_filblks_t blockcount = 0; /* number of blocks in extent */ - xfs_bmbt_rec_t *ep = NULL; /* pointer to target extent */ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundary in search */ - xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundary in search */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_fileoff_t startoff = 0; /* start offset of extent */ - - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { - *idxp = 0; - return NULL; - } - low = 0; - if (ifp->if_flags & XFS_IFEXTIREC) { - /* Find target extent list */ - int erp_idx = 0; - erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); - base = erp->er_extbuf; - high = erp->er_extcount - 1; - } else { - base = ifp->if_u1.if_extents; - high = nextents - 1; - } - /* Binary search extent records */ - while (low <= high) { - idx = (low + high) >> 1; - ep = base + idx; - startoff = xfs_bmbt_get_startoff(ep); - blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < startoff) { - high = idx - 1; - } else if (bno >= startoff + blockcount) { - low = idx + 1; - } else { - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - *idxp = idx; - return ep; - } - } - /* Convert back to file-based extent index */ - if (ifp->if_flags & XFS_IFEXTIREC) { - idx += erp->er_extoff; - } - if (bno >= startoff + blockcount) { - if (++idx == nextents) { - ep = NULL; - } else { - ep = xfs_iext_get_ext(ifp, idx); - } - } - *idxp = idx; - return ep; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record for filesystem block bno. Store the index of the - * target irec in *erp_idxp. - */ -xfs_ext_irec_t * /* pointer to found extent record */ -xfs_iext_bno_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number to search for */ - int *erp_idxp) /* irec index of target ext list */ -{ - xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - xfs_ext_irec_t *erp_next; /* next indirection array entry */ - int erp_idx; /* indirection array index */ - int nlists; /* number of extent irec's (lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; - if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { - high = erp_idx - 1; - } else if (erp_next && bno >= - xfs_bmbt_get_startoff(erp_next->er_extbuf)) { - low = erp_idx + 1; - } else { - break; - } - } - *erp_idxp = erp_idx; - return erp; -} - -/* - * Return a pointer to the indirection array entry containing the - * extent record at file extent index *idxp. Store the index of the - * target irec in *erp_idxp and store the page index of the target - * extent record in *idxp. - */ -xfs_ext_irec_t * -xfs_iext_idx_to_irec( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_extnum_t *idxp, /* extent index (file -> page) */ - int *erp_idxp, /* pointer to target irec */ - int realloc) /* new bytes were just added */ -{ - xfs_ext_irec_t *prev; /* pointer to previous irec */ - xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ - int erp_idx; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - int high; /* binary search upper limit */ - int low; /* binary search lower limit */ - xfs_extnum_t page_idx = *idxp; /* extent index in target list */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - ASSERT(page_idx >= 0 && page_idx <= - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp_idx = 0; - low = 0; - high = nlists - 1; - - /* Binary search extent irec's */ - while (low <= high) { - erp_idx = (low + high) >> 1; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - prev = erp_idx > 0 ? erp - 1 : NULL; - if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && - realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { - high = erp_idx - 1; - } else if (page_idx > erp->er_extoff + erp->er_extcount || - (page_idx == erp->er_extoff + erp->er_extcount && - !realloc)) { - low = erp_idx + 1; - } else if (page_idx == erp->er_extoff + erp->er_extcount && - erp->er_extcount == XFS_LINEAR_EXTS) { - ASSERT(realloc); - page_idx = 0; - erp_idx++; - erp = erp_idx < nlists ? erp + 1 : NULL; - break; - } else { - page_idx -= erp->er_extoff; - break; - } - } - *idxp = page_idx; - *erp_idxp = erp_idx; - return(erp); -} - -/* - * Allocate and initialize an indirection array once the space needed - * for incore extents increases above XFS_IEXT_BUFSZ. - */ -void -xfs_iext_irec_init( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - xfs_extnum_t nextents; /* number of extents in file */ - - ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nextents <= XFS_LINEAR_EXTS); - - erp = (xfs_ext_irec_t *) - kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP); - - if (nextents == 0) { - ifp->if_u1.if_extents = (xfs_bmbt_rec_t *) - kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); - } else if (!ifp->if_real_bytes) { - xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); - } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { - xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); - } - erp->er_extbuf = ifp->if_u1.if_extents; - erp->er_extcount = nextents; - erp->er_extoff = 0; - - ifp->if_flags |= XFS_IFEXTIREC; - ifp->if_real_bytes = XFS_IEXT_BUFSZ; - ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); - ifp->if_u1.if_ext_irec = erp; - - return; -} - -/* - * Allocate and initialize a new entry in the indirection array. - */ -xfs_ext_irec_t * -xfs_iext_irec_new( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* index for new irec */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* Resize indirection array */ - xfs_iext_realloc_indirect(ifp, ++nlists * - sizeof(xfs_ext_irec_t)); - /* - * Move records down in the array so the - * new page can use erp_idx. - */ - erp = ifp->if_u1.if_ext_irec; - for (i = nlists - 1; i > erp_idx; i--) { - memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); - } - ASSERT(i == erp_idx); - - /* Initialize new extent record */ - erp = ifp->if_u1.if_ext_irec; - erp[erp_idx].er_extbuf = (xfs_bmbt_rec_t *) - kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP); - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; - memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); - erp[erp_idx].er_extcount = 0; - erp[erp_idx].er_extoff = erp_idx > 0 ? - erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; - return (&erp[erp_idx]); -} - -/* - * Remove a record from the indirection array. - */ -void -xfs_iext_irec_remove( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx) /* irec index to remove */ -{ - xfs_ext_irec_t *erp; /* indirection array pointer */ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - if (erp->er_extbuf) { - xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, - -erp->er_extcount); - kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ); - } - /* Compact extent records */ - erp = ifp->if_u1.if_ext_irec; - for (i = erp_idx; i < nlists - 1; i++) { - memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); - } - /* - * Manually free the last extent record from the indirection - * array. A call to xfs_iext_realloc_indirect() with a size - * of zero would result in a call to xfs_iext_destroy() which - * would in turn call this function again, creating a nasty - * infinite loop. - */ - if (--nlists) { - xfs_iext_realloc_indirect(ifp, - nlists * sizeof(xfs_ext_irec_t)); - } else { - kmem_free(ifp->if_u1.if_ext_irec, - sizeof(xfs_ext_irec_t)); - } - ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; -} - -/* - * This is called to clean up large amounts of unused memory allocated - * by the indirection array. Before compacting anything though, verify - * that the indirection array is still needed and switch back to the - * linear extent list (or even the inline buffer) if possible. The - * compaction policy is as follows: - * - * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space - * No Compaction: Extents occupy at least 50% of allocated space - */ -void -xfs_iext_irec_compact( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_extnum_t nextents; /* number of extents in file */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - - if (nextents == 0) { - xfs_iext_destroy(ifp); - } else if (nextents <= XFS_INLINE_EXTS) { - xfs_iext_indirect_to_direct(ifp); - xfs_iext_direct_to_inline(ifp, nextents); - } else if (nextents <= XFS_LINEAR_EXTS) { - xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { - xfs_iext_irec_compact_pages(ifp); - } -} - -/* - * Combine extents from neighboring extent pages. - */ -void -xfs_iext_irec_compact_pages( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ - int erp_idx = 0; /* indirection array index */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - while (erp_idx < nlists - 1) { - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - erp_next = erp + 1; - if (erp_next->er_extcount <= - (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], - erp_next->er_extbuf, erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += erp_next->er_extcount; - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - } else { - erp_idx++; - } - } -} - -/* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - while (erp_idx < nlists - 1) { - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - /* Remove next page */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf, - erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - /* Update next page */ - } else { - /* Move rest of page up to become next new page */ - memmove(erp_next->er_extbuf, ep_next, - erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - } - if (erp->er_extcount == XFS_LINEAR_EXTS) { - erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; - } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - } -} - -/* - * This is called to update the er_extoff field in the indirection - * array when extents have been added or removed from one of the - * extent lists. erp_idx contains the irec index to begin updating - * at and ext_diff contains the number of extents that were added - * or removed. - */ -void -xfs_iext_irec_update_extoffs( - xfs_ifork_t *ifp, /* inode fork pointer */ - int erp_idx, /* irec index to update */ - int ext_diff) /* number of new extents */ -{ - int i; /* loop counter */ - int nlists; /* number of irec's (ex lists */ + ASSERT(bp->b_fspriv != NULL); + ASSERT(bp->b_iodone != NULL); + return 0; - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - for (i = erp_idx; i < nlists; i++) { - ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; - } +corrupt_out: + return XFS_ERROR(EFSCORRUPTED); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index d10b76ed1e5..f72bffa6726 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -18,498 +18,384 @@ #ifndef __XFS_INODE_H__ #define __XFS_INODE_H__ -/* - * Fork identifiers. - */ -#define XFS_DATA_FORK 0 -#define XFS_ATTR_FORK 1 - -/* - * The following xfs_ext_irec_t struct introduces a second (top) level - * to the in-core extent allocation scheme. These structs are allocated - * in a contiguous block, creating an indirection array where each entry - * (irec) contains a pointer to a buffer of in-core extent records which - * it manages. Each extent buffer is 4k in size, since 4k is the system - * page size on Linux i386 and systems with larger page sizes don't seem - * to gain much, if anything, by using their native page size as the - * extent buffer size. Also, using 4k extent buffers everywhere provides - * a consistent interface for CXFS across different platforms. - * - * There is currently no limit on the number of irec's (extent lists) - * allowed, so heavily fragmented files may require an indirection array - * which spans multiple system pages of memory. The number of extents - * which would require this amount of contiguous memory is very large - * and should not cause problems in the foreseeable future. However, - * if the memory needed for the contiguous array ever becomes a problem, - * it is possible that a third level of indirection may be required. - */ -typedef struct xfs_ext_irec { - xfs_bmbt_rec_t *er_extbuf; /* block of extent records */ - xfs_extnum_t er_extoff; /* extent offset in file */ - xfs_extnum_t er_extcount; /* number of extents in page/block */ -} xfs_ext_irec_t; - -/* - * File incore extent information, present for each of data & attr forks. - */ -#define XFS_IEXT_BUFSZ 4096 -#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) -#define XFS_INLINE_EXTS 2 -#define XFS_INLINE_DATA 32 -typedef struct xfs_ifork { - int if_bytes; /* bytes in if_u1 */ - int if_real_bytes; /* bytes allocated in if_u1 */ - xfs_bmbt_block_t *if_broot; /* file's incore btree root */ - short if_broot_bytes; /* bytes allocated for root */ - unsigned char if_flags; /* per-fork flags */ - unsigned char if_ext_max; /* max # of extent records */ - xfs_extnum_t if_lastex; /* last if_extents used */ - union { - xfs_bmbt_rec_t *if_extents; /* linear map file exts */ - xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ - char *if_data; /* inline file data */ - } if_u1; - union { - xfs_bmbt_rec_t if_inline_ext[XFS_INLINE_EXTS]; - /* very small file extents */ - char if_inline_data[XFS_INLINE_DATA]; - /* very small file data */ - xfs_dev_t if_rdev; /* dev number if special */ - uuid_t if_uuid; /* mount point value */ - } if_u2; -} xfs_ifork_t; - -/* - * Flags for xfs_ichgtime(). - */ -#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ -#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */ -#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */ +#include "xfs_inode_buf.h" +#include "xfs_inode_fork.h" +#include "xfs_dinode.h" /* - * Per-fork incore inode flags. + * Kernel only inode definitions */ -#define XFS_IFINLINE 0x01 /* Inline data is read in */ -#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ -#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ -#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ - -/* - * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate(). - */ -#define XFS_IMAP_LOOKUP 0x1 -#define XFS_IMAP_BULKSTAT 0x2 - -#ifdef __KERNEL__ -struct bhv_desc; -struct bhv_vnode; -struct cred; -struct ktrace; +struct xfs_dinode; +struct xfs_inode; struct xfs_buf; struct xfs_bmap_free; struct xfs_bmbt_irec; -struct xfs_bmbt_block; -struct xfs_inode; struct xfs_inode_log_item; struct xfs_mount; struct xfs_trans; struct xfs_dquot; -#if defined(XFS_ILOCK_TRACE) -#define XFS_ILOCK_KTRACE_SIZE 32 -extern ktrace_t *xfs_ilock_trace_buf; -extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *); -#else -#define xfs_ilock_trace(i,n,f,ra) -#endif - -typedef struct dm_attrs_s { - __uint32_t da_dmevmask; /* DMIG event mask */ - __uint16_t da_dmstate; /* DMIG state info */ - __uint16_t da_pad; /* DMIG extra padding */ -} dm_attrs_t; - -typedef struct xfs_iocore { - void *io_obj; /* pointer to container - * inode or dcxvn structure */ - struct xfs_mount *io_mount; /* fs mount struct ptr */ -#ifdef DEBUG - mrlock_t *io_lock; /* inode IO lock */ - mrlock_t *io_iolock; /* inode IO lock */ -#endif - - /* I/O state */ - xfs_fsize_t io_new_size; /* sz when write completes */ - - /* Miscellaneous state. */ - unsigned int io_flags; /* IO related flags */ - - /* DMAPI state */ - dm_attrs_t io_dmattrs; - -} xfs_iocore_t; - -#define io_dmevmask io_dmattrs.da_dmevmask -#define io_dmstate io_dmattrs.da_dmstate - -#define XFS_IO_INODE(io) ((xfs_inode_t *) ((io)->io_obj)) -#define XFS_IO_DCXVN(io) ((dcxvn_t *) ((io)->io_obj)) - -/* - * Flags in the flags field - */ - -#define XFS_IOCORE_RT 0x1 - -/* - * xfs_iocore prototypes - */ - -extern void xfs_iocore_inode_init(struct xfs_inode *); -extern void xfs_iocore_inode_reinit(struct xfs_inode *); - - -/* - * This is the type used in the xfs inode hash table. - * An array of these is allocated for each mounted - * file system to hash the inodes for that file system. - */ -typedef struct xfs_ihash { - struct xfs_inode *ih_next; - rwlock_t ih_lock; - uint ih_version; -} xfs_ihash_t; - -#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize)) - -/* - * This is the xfs inode cluster hash. This hash is used by xfs_iflush to - * find inodes that share a cluster and can be flushed to disk at the same - * time. - */ -typedef struct xfs_chashlist { - struct xfs_chashlist *chl_next; - struct xfs_chashlist *chl_prev; - struct xfs_inode *chl_ip; - xfs_daddr_t chl_blkno; /* starting block number of - * the cluster */ - struct xfs_buf *chl_buf; /* the inode buffer */ -} xfs_chashlist_t; - -typedef struct xfs_chash { - xfs_chashlist_t *ch_list; - lock_t ch_lock; -} xfs_chash_t; - -#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize)) - - -/* - * This is the xfs in-core inode structure. - * Most of the on-disk inode is embedded in the i_d field. - * - * The extent pointers/inline file space, however, are managed - * separately. The memory for this information is pointed to by - * the if_u1 unions depending on the type of the data. - * This is used to linearize the array of extents for fast in-core - * access. This is used until the file's number of extents - * surpasses XFS_MAX_INCORE_EXTENTS, at which point all extent pointers - * are accessed through the buffer cache. - * - * Other state kept in the in-core inode is used for identification, - * locking, transactional updating, etc of the inode. - * - * Generally, we do not want to hold the i_rlock while holding the - * i_ilock. Hierarchy is i_iolock followed by i_rlock. - * - * xfs_iptr_t contains all the inode fields upto and including the - * i_mnext and i_mprev fields, it is used as a marker in the inode - * chain off the mount structure by xfs_sync calls. - */ - -typedef struct { - struct xfs_ihash *ip_hash; /* pointer to hash header */ - struct xfs_inode *ip_next; /* inode hash link forw */ - struct xfs_inode *ip_mnext; /* next inode in mount list */ - struct xfs_inode *ip_mprev; /* ptr to prev inode */ - struct xfs_inode **ip_prevp; /* ptr to prev i_next */ - struct xfs_mount *ip_mount; /* fs mount struct ptr */ -} xfs_iptr_t; - typedef struct xfs_inode { /* Inode linking and identification information. */ - struct xfs_ihash *i_hash; /* pointer to hash header */ - struct xfs_inode *i_next; /* inode hash link forw */ - struct xfs_inode *i_mnext; /* next inode in mount list */ - struct xfs_inode *i_mprev; /* ptr to prev inode */ - struct xfs_inode **i_prevp; /* ptr to prev i_next */ struct xfs_mount *i_mount; /* fs mount struct ptr */ - struct list_head i_reclaim; /* reclaim list */ - struct bhv_desc i_bhv_desc; /* inode behavior descriptor*/ struct xfs_dquot *i_udquot; /* user dquot */ struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ - xfs_daddr_t i_blkno; /* blkno of inode buffer */ - ushort i_len; /* len of inode buffer */ - ushort i_boffset; /* off of inode in buffer */ + struct xfs_imap i_imap; /* location for xfs_imap() */ /* Extent information. */ xfs_ifork_t *i_afp; /* attribute fork pointer */ xfs_ifork_t i_df; /* data fork */ + /* operations vectors */ + const struct xfs_dir_ops *d_ops; /* directory ops vector */ + /* Transaction and locking information. */ - struct xfs_trans *i_transp; /* ptr to owning transaction*/ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ - sema_t i_flock; /* inode flush lock */ atomic_t i_pincount; /* inode pin count */ - wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ -#ifdef HAVE_REFCACHE - struct xfs_inode **i_refcache; /* ptr to entry in ref cache */ - struct xfs_inode *i_release; /* inode to unref */ -#endif - /* I/O state */ - xfs_iocore_t i_iocore; /* I/O core */ - + spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. */ - unsigned short i_flags; /* see defined flags below */ - unsigned char i_update_core; /* timestamps/size is dirty */ - unsigned char i_update_size; /* di_size field is dirty */ - unsigned int i_gen; /* generation count */ + unsigned long i_flags; /* see defined flags below */ unsigned int i_delayed_blks; /* count of delay alloc blks */ - xfs_dinode_core_t i_d; /* most of ondisk inode */ - xfs_chashlist_t *i_chash; /* cluster hash list header */ - struct xfs_inode *i_cnext; /* cluster hash link forward */ - struct xfs_inode *i_cprev; /* cluster hash link backward */ - - /* Trace buffers per inode. */ -#ifdef XFS_BMAP_TRACE - struct ktrace *i_xtrace; /* inode extent list trace */ -#endif -#ifdef XFS_BMBT_TRACE - struct ktrace *i_btrace; /* inode bmap btree trace */ -#endif -#ifdef XFS_RW_TRACE - struct ktrace *i_rwtrace; /* inode read/write trace */ -#endif -#ifdef XFS_ILOCK_TRACE - struct ktrace *i_lock_trace; /* inode lock/unlock trace */ -#endif -#ifdef XFS_DIR2_TRACE - struct ktrace *i_dir_trace; /* inode directory trace */ -#endif + xfs_icdinode_t i_d; /* most of ondisk inode */ + + /* VFS inode */ + struct inode i_vnode; /* embedded VFS inode */ } xfs_inode_t; -#endif /* __KERNEL__ */ +/* Convert from vfs inode to xfs inode */ +static inline struct xfs_inode *XFS_I(struct inode *inode) +{ + return container_of(inode, struct xfs_inode, i_vnode); +} + +/* convert from xfs inode to vfs inode */ +static inline struct inode *VFS_I(struct xfs_inode *ip) +{ + return &ip->i_vnode; +} +/* + * For regular files we only update the on-disk filesize when actually + * writing data back to disk. Until then only the copy in the VFS inode + * is uptodate. + */ +static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip) +{ + if (S_ISREG(ip->i_d.di_mode)) + return i_size_read(VFS_I(ip)); + return ip->i_d.di_size; +} /* - * Fork handling. + * If this I/O goes past the on-disk inode size update it unless it would + * be past the current in-core inode size. */ -#define XFS_IFORK_PTR(ip,w) \ - ((w) == XFS_DATA_FORK ? &(ip)->i_df : (ip)->i_afp) -#define XFS_IFORK_Q(ip) XFS_CFORK_Q(&(ip)->i_d) -#define XFS_IFORK_DSIZE(ip) XFS_CFORK_DSIZE(&ip->i_d, ip->i_mount) -#define XFS_IFORK_ASIZE(ip) XFS_CFORK_ASIZE(&ip->i_d, ip->i_mount) -#define XFS_IFORK_SIZE(ip,w) XFS_CFORK_SIZE(&ip->i_d, ip->i_mount, w) -#define XFS_IFORK_FORMAT(ip,w) XFS_CFORK_FORMAT(&ip->i_d, w) -#define XFS_IFORK_FMT_SET(ip,w,n) XFS_CFORK_FMT_SET(&ip->i_d, w, n) -#define XFS_IFORK_NEXTENTS(ip,w) XFS_CFORK_NEXTENTS(&ip->i_d, w) -#define XFS_IFORK_NEXT_SET(ip,w,n) XFS_CFORK_NEXT_SET(&ip->i_d, w, n) +static inline xfs_fsize_t +xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) +{ + xfs_fsize_t i_size = i_size_read(VFS_I(ip)); + if (new_size > i_size) + new_size = i_size; + return new_size > ip->i_d.di_size ? new_size : 0; +} -#ifdef __KERNEL__ +/* + * i_flags helper functions + */ +static inline void +__xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + ip->i_flags |= flags; +} + +static inline void +xfs_iflags_set(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + __xfs_iflags_set(ip, flags); + spin_unlock(&ip->i_flags_lock); +} + +static inline void +xfs_iflags_clear(xfs_inode_t *ip, unsigned short flags) +{ + spin_lock(&ip->i_flags_lock); + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); +} + +static inline int +__xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + return (ip->i_flags & flags); +} + +static inline int +xfs_iflags_test(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + spin_lock(&ip->i_flags_lock); + ret = __xfs_iflags_test(ip, flags); + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (ret) + ip->i_flags &= ~flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +static inline int +xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags) +{ + int ret; + + spin_lock(&ip->i_flags_lock); + ret = ip->i_flags & flags; + if (!ret) + ip->i_flags |= flags; + spin_unlock(&ip->i_flags_lock); + return ret; +} + +/* + * Project quota id helpers (previously projid was 16bit only + * and using two 16bit values to hold new 32bit projid was chosen + * to retain compatibility with "old" filesystems). + */ +static inline prid_t +xfs_get_projid(struct xfs_inode *ip) +{ + return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo; +} + +static inline void +xfs_set_projid(struct xfs_inode *ip, + prid_t projid) +{ + ip->i_d.di_projid_hi = (__uint16_t) (projid >> 16); + ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff); +} + +static inline prid_t +xfs_get_initial_prid(struct xfs_inode *dp) +{ + if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + return xfs_get_projid(dp); + + return XFS_PROJID_DEFAULT; +} /* * In-core inode flags. */ -#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */ -#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ -#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ -#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ -#define XFS_ISTALE 0x0010 /* inode has been staled */ -#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ -#define XFS_INEW 0x0040 +#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ +#define XFS_ISTALE (1 << 1) /* inode has been staled */ +#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ +#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ +#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ +#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ +#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT) +#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ +#define XFS_IPINNED (1 << __XFS_IPINNED_BIT) +#define XFS_IDONTCACHE (1 << 9) /* don't cache the inode long term */ /* - * Flags for inode locking. + * Per-lifetime flags need to be reset when re-using a reclaimable inode during + * inode lookup. This prevents unintended behaviour on the new inode from + * ocurring. */ -#define XFS_IOLOCK_EXCL 0x001 -#define XFS_IOLOCK_SHARED 0x002 -#define XFS_ILOCK_EXCL 0x004 -#define XFS_ILOCK_SHARED 0x008 -#define XFS_IUNLOCK_NONOTIFY 0x010 -/* XFS_IOLOCK_NESTED 0x020 */ -#define XFS_EXTENT_TOKEN_RD 0x040 -#define XFS_SIZE_TOKEN_RD 0x080 -#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD) -#define XFS_WILLLEND 0x100 /* Always acquire tokens for lending */ -#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND) -#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) -#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) -/* XFS_SIZE_TOKEN_WANT 0x200 */ - -#define XFS_LOCK_MASK \ - (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL | \ - XFS_ILOCK_SHARED | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD | \ - XFS_WILLLEND) +#define XFS_IRECLAIM_RESET_FLAGS \ + (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + +/* + * Synchronize processes attempting to flush the in-core inode back to disk. + */ + +extern void __xfs_iflock(struct xfs_inode *ip); + +static inline int xfs_iflock_nowait(struct xfs_inode *ip) +{ + return !xfs_iflags_test_and_set(ip, XFS_IFLOCK); +} + +static inline void xfs_iflock(struct xfs_inode *ip) +{ + if (!xfs_iflock_nowait(ip)) + __xfs_iflock(ip); +} + +static inline void xfs_ifunlock(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_IFLOCK); + smp_mb(); + wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); +} + +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} /* - * Flags for xfs_iflush() + * Flags for inode locking. + * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) + * 1<<16 - 1<<32-1 -- lockdep annotation (integers) */ -#define XFS_IFLUSH_DELWRI_ELSE_SYNC 1 -#define XFS_IFLUSH_DELWRI_ELSE_ASYNC 2 -#define XFS_IFLUSH_SYNC 3 -#define XFS_IFLUSH_ASYNC 4 -#define XFS_IFLUSH_DELWRI 5 +#define XFS_IOLOCK_EXCL (1<<0) +#define XFS_IOLOCK_SHARED (1<<1) +#define XFS_ILOCK_EXCL (1<<2) +#define XFS_ILOCK_SHARED (1<<3) + +#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) + +#define XFS_LOCK_FLAGS \ + { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ + { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ + { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ + { XFS_ILOCK_SHARED, "ILOCK_SHARED" } + /* - * Flags for xfs_itruncate_start(). + * Flags for lockdep annotations. + * + * XFS_LOCK_PARENT - for directory operations that require locking a + * parent directory inode and a child entry inode. The parent gets locked + * with this flag so it gets a lockdep subclass of 1 and the child entry + * lock will have a lockdep subclass of 0. + * + * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary + * inodes do not participate in the normal lock order, and thus have their + * own subclasses. + * + * XFS_LOCK_INUMORDER - for locking several inodes at the some time + * with xfs_lock_inodes(). This flag is used as the starting subclass + * and each subsequent lock acquired will increment the subclass by one. + * So the first lock acquired will have a lockdep subclass of 4, the + * second lock will have a lockdep subclass of 5, and so on. It is + * the responsibility of the class builder to shift this to the correct + * portion of the lock_mode lockdep mask. */ -#define XFS_ITRUNC_DEFINITE 0x1 -#define XFS_ITRUNC_MAYBE 0x2 +#define XFS_LOCK_PARENT 1 +#define XFS_LOCK_RTBITMAP 2 +#define XFS_LOCK_RTSUM 3 +#define XFS_LOCK_INUMORDER 4 + +#define XFS_IOLOCK_SHIFT 16 +#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) -#define XFS_ITOV(ip) BHV_TO_VNODE(XFS_ITOBHV(ip)) -#define XFS_ITOV_NULL(ip) BHV_TO_VNODE_NULL(XFS_ITOBHV(ip)) -#define XFS_ITOBHV(ip) ((struct bhv_desc *)(&((ip)->i_bhv_desc))) -#define XFS_BHVTOI(bhvp) ((xfs_inode_t *)((char *)(bhvp) - \ - (char *)&(((xfs_inode_t *)0)->i_bhv_desc))) -#define BHV_IS_XFS(bdp) (BHV_OPS(bdp) == &xfs_vnodeops) +#define XFS_ILOCK_SHIFT 24 +#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) +#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) + +#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_ILOCK_DEP_MASK 0xff000000 +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) + +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) /* * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and * new subdirectory gets S_ISGID bit from parent. */ -#define XFS_INHERIT_GID(pip, vfsp) \ - (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID)) +#define XFS_INHERIT_GID(pip) \ + (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ + ((pip)->i_d.di_mode & S_ISGID)) + + +int xfs_release(struct xfs_inode *ip); +void xfs_inactive(struct xfs_inode *ip); +int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); +int xfs_create(struct xfs_inode *dp, struct xfs_name *name, + umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp); +int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry, + umode_t mode, struct xfs_inode **ipp); +int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, + struct xfs_inode *ip); +int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, + struct xfs_name *target_name); +int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, + struct xfs_inode *src_ip, struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip); -/* - * xfs_iget.c prototypes. - */ - -#define IGET_CREATE 1 - -void xfs_ihash_init(struct xfs_mount *); -void xfs_ihash_free(struct xfs_mount *); -void xfs_chash_init(struct xfs_mount *); -void xfs_chash_free(struct xfs_mount *); -xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, - struct xfs_trans *); -void xfs_inode_lock_init(xfs_inode_t *, struct bhv_vnode *); -int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - uint, uint, xfs_inode_t **, xfs_daddr_t); -void xfs_iput(xfs_inode_t *, uint); -void xfs_iput_new(xfs_inode_t *, uint); void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -void xfs_iflock(xfs_inode_t *); -int xfs_iflock_nowait(xfs_inode_t *); -uint xfs_ilock_map_shared(xfs_inode_t *); -void xfs_iunlock_map_shared(xfs_inode_t *, uint); -void xfs_ifunlock(xfs_inode_t *); -void xfs_ireclaim(xfs_inode_t *); -int xfs_finish_reclaim(xfs_inode_t *, int, int); -int xfs_finish_reclaim_all(struct xfs_mount *, int); +int xfs_isilocked(xfs_inode_t *, uint); +uint xfs_ilock_data_map_shared(struct xfs_inode *); +uint xfs_ilock_attr_map_shared(struct xfs_inode *); +int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_buf **, xfs_inode_t **); -/* - * xfs_inode.c prototypes. - */ -int xfs_itobp(struct xfs_mount *, struct xfs_trans *, - xfs_inode_t *, xfs_dinode_t **, struct xfs_buf **, - xfs_daddr_t, uint); -int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, - xfs_inode_t **, xfs_daddr_t); -int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); -int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t, - xfs_nlink_t, xfs_dev_t, struct cred *, xfs_prid_t, - int, struct xfs_buf **, boolean_t *, xfs_inode_t **); -void xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *, - int); uint xfs_ip2xflags(struct xfs_inode *); -uint xfs_dic2xflags(struct xfs_dinode_core *); +uint xfs_dic2xflags(struct xfs_dinode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_bmap_free *); -void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); -int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, - xfs_fsize_t, int, int); +int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, + int, xfs_fsize_t); int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); -int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *); -void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *, - xfs_fsize_t, int); - -void xfs_idestroy_fork(xfs_inode_t *, int); -void xfs_idestroy(xfs_inode_t *); -void xfs_idata_realloc(xfs_inode_t *, int, int); -void xfs_iextract(xfs_inode_t *); + void xfs_iext_realloc(xfs_inode_t *, int, int); -void xfs_iroot_realloc(xfs_inode_t *, int, int); -void xfs_ipin(xfs_inode_t *); -void xfs_iunpin(xfs_inode_t *); -int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int); -int xfs_iflush(xfs_inode_t *, uint); -void xfs_iflush_all(struct xfs_mount *); -int xfs_iaccess(xfs_inode_t *, mode_t, cred_t *); -uint xfs_iroundup(uint); -void xfs_ichgtime(xfs_inode_t *, int); -xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); -void xfs_lock_inodes(xfs_inode_t **, int, int, uint); - -xfs_inode_t *xfs_vtoi(struct bhv_vnode *vp); - -void xfs_synchronize_atime(xfs_inode_t *); - -xfs_bmbt_rec_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t, - xfs_bmbt_irec_t *); -void xfs_iext_add(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_add_indirect_multi(xfs_ifork_t *, int, xfs_extnum_t, int); -void xfs_iext_remove(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); -void xfs_iext_realloc_direct(xfs_ifork_t *, int); -void xfs_iext_realloc_indirect(xfs_ifork_t *, int); -void xfs_iext_indirect_to_direct(xfs_ifork_t *); -void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); -void xfs_iext_inline_to_direct(xfs_ifork_t *, int); -void xfs_iext_destroy(xfs_ifork_t *); -xfs_bmbt_rec_t *xfs_iext_bno_to_ext(xfs_ifork_t *, xfs_fileoff_t, int *); -xfs_ext_irec_t *xfs_iext_bno_to_irec(xfs_ifork_t *, xfs_fileoff_t, int *); -xfs_ext_irec_t *xfs_iext_idx_to_irec(xfs_ifork_t *, xfs_extnum_t *, int *, int); -void xfs_iext_irec_init(xfs_ifork_t *); -xfs_ext_irec_t *xfs_iext_irec_new(xfs_ifork_t *, int); -void xfs_iext_irec_remove(xfs_ifork_t *, int); -void xfs_iext_irec_compact(xfs_ifork_t *); -void xfs_iext_irec_compact_pages(xfs_ifork_t *); -void xfs_iext_irec_compact_full(xfs_ifork_t *); -void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); +void xfs_iunpin_wait(xfs_inode_t *); #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) -#ifdef DEBUG -void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t); -#else /* DEBUG */ -#define xfs_isize_check(mp, ip, isize) -#endif /* DEBUG */ +int xfs_iflush(struct xfs_inode *, struct xfs_buf **); +void xfs_lock_inodes(xfs_inode_t **, int, uint); +void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); -#if defined(DEBUG) -void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); -#else -#define xfs_inobp_check(mp, bp) -#endif /* DEBUG */ +xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); -extern struct kmem_zone *xfs_chashlist_zone; -extern struct kmem_zone *xfs_ifork_zone; -extern struct kmem_zone *xfs_inode_zone; -extern struct kmem_zone *xfs_ili_zone; +int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, + xfs_nlink_t, xfs_dev_t, prid_t, int, + struct xfs_inode **, int *); +int xfs_droplink(struct xfs_trans *, struct xfs_inode *); +int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); -#endif /* __KERNEL__ */ +/* from xfs_file.c */ +int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +int xfs_iozero(struct xfs_inode *, loff_t, size_t); + + +#define IHOLD(ip) \ +do { \ + ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ + ihold(VFS_I(ip)); \ + trace_xfs_ihold(ip, _THIS_IP_); \ +} while (0) + +#define IRELE(ip) \ +do { \ + trace_xfs_irele(ip, _THIS_IP_); \ + iput(VFS_I(ip)); \ +} while (0) + +extern struct kmem_zone *xfs_inode_zone; #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c new file mode 100644 index 00000000000..cb35ae41d4a --- /dev/null +++ b/fs/xfs/xfs_inode_buf.c @@ -0,0 +1,479 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_cksum.h" +#include "xfs_icache.h" +#include "xfs_trans.h" +#include "xfs_ialloc.h" +#include "xfs_dinode.h" + +/* + * Check that none of the inode's in the buffer have a next + * unlinked field of 0. + */ +#if defined(DEBUG) +void +xfs_inobp_check( + xfs_mount_t *mp, + xfs_buf_t *bp) +{ + int i; + int j; + xfs_dinode_t *dip; + + j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; + + for (i = 0; i < j; i++) { + dip = (xfs_dinode_t *)xfs_buf_offset(bp, + i * mp->m_sb.sb_inodesize); + if (!dip->di_next_unlinked) { + xfs_alert(mp, + "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", + i, (long long)bp->b_bn); + } + } +} +#endif + +/* + * If we are doing readahead on an inode buffer, we might be in log recovery + * reading an inode allocation buffer that hasn't yet been replayed, and hence + * has not had the inode cores stamped into it. Hence for readahead, the buffer + * may be potentially invalid. + * + * If the readahead buffer is invalid, we don't want to mark it with an error, + * but we do want to clear the DONE status of the buffer so that a followup read + * will re-read it from disk. This will ensure that we don't get an unnecessary + * warnings during log recovery and we don't get unnecssary panics on debug + * kernels. + */ +static void +xfs_inode_buf_verify( + struct xfs_buf *bp, + bool readahead) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + int i; + int ni; + + /* + * Validate the magic number and version of every inode in the buffer + */ + ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; + for (i = 0; i < ni; i++) { + int di_ok; + xfs_dinode_t *dip; + + dip = (struct xfs_dinode *)xfs_buf_offset(bp, + (i << mp->m_sb.sb_inodelog)); + di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && + XFS_DINODE_GOOD_VERSION(dip->di_version); + if (unlikely(XFS_TEST_ERROR(!di_ok, mp, + XFS_ERRTAG_ITOBP_INOTOBP, + XFS_RANDOM_ITOBP_INOTOBP))) { + if (readahead) { + bp->b_flags &= ~XBF_DONE; + return; + } + + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); +#ifdef DEBUG + xfs_alert(mp, + "bad inode magic/vsn daddr %lld #%d (magic=%x)", + (unsigned long long)bp->b_bn, i, + be16_to_cpu(dip->di_magic)); +#endif + } + } + xfs_inobp_check(mp, bp); +} + + +static void +xfs_inode_buf_read_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +static void +xfs_inode_buf_readahead_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, true); +} + +static void +xfs_inode_buf_write_verify( + struct xfs_buf *bp) +{ + xfs_inode_buf_verify(bp, false); +} + +const struct xfs_buf_ops xfs_inode_buf_ops = { + .verify_read = xfs_inode_buf_read_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + +const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .verify_read = xfs_inode_buf_readahead_verify, + .verify_write = xfs_inode_buf_write_verify, +}; + + +/* + * This routine is called to map an inode to the buffer containing the on-disk + * version of the inode. It returns a pointer to the buffer containing the + * on-disk inode in the bpp parameter, and in the dipp parameter it returns a + * pointer to the on-disk inode within that buffer. + * + * If a non-zero error is returned, then the contents of bpp and dipp are + * undefined. + */ +int +xfs_imap_to_bp( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_imap *imap, + struct xfs_dinode **dipp, + struct xfs_buf **bpp, + uint buf_flags, + uint iget_flags) +{ + struct xfs_buf *bp; + int error; + + buf_flags |= XBF_UNMAPPED; + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, + (int)imap->im_len, buf_flags, &bp, + &xfs_inode_buf_ops); + if (error) { + if (error == EAGAIN) { + ASSERT(buf_flags & XBF_TRYLOCK); + return error; + } + + if (error == EFSCORRUPTED && + (iget_flags & XFS_IGET_UNTRUSTED)) + return XFS_ERROR(EINVAL); + + xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", + __func__, error); + return error; + } + + *bpp = bp; + *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + return 0; +} + +void +xfs_dinode_from_disk( + xfs_icdinode_t *to, + xfs_dinode_t *from) +{ + to->di_magic = be16_to_cpu(from->di_magic); + to->di_mode = be16_to_cpu(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = be16_to_cpu(from->di_onlink); + to->di_uid = be32_to_cpu(from->di_uid); + to->di_gid = be32_to_cpu(from->di_gid); + to->di_nlink = be32_to_cpu(from->di_nlink); + to->di_projid_lo = be16_to_cpu(from->di_projid_lo); + to->di_projid_hi = be16_to_cpu(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_flushiter = be16_to_cpu(from->di_flushiter); + to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec); + to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec); + to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec); + to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec); + to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec); + to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec); + to->di_size = be64_to_cpu(from->di_size); + to->di_nblocks = be64_to_cpu(from->di_nblocks); + to->di_extsize = be32_to_cpu(from->di_extsize); + to->di_nextents = be32_to_cpu(from->di_nextents); + to->di_anextents = be16_to_cpu(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = be32_to_cpu(from->di_dmevmask); + to->di_dmstate = be16_to_cpu(from->di_dmstate); + to->di_flags = be16_to_cpu(from->di_flags); + to->di_gen = be32_to_cpu(from->di_gen); + + if (to->di_version == 3) { + to->di_changecount = be64_to_cpu(from->di_changecount); + to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec); + to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec); + to->di_flags2 = be64_to_cpu(from->di_flags2); + to->di_ino = be64_to_cpu(from->di_ino); + to->di_lsn = be64_to_cpu(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + } +} + +void +xfs_dinode_to_disk( + xfs_dinode_t *to, + xfs_icdinode_t *from) +{ + to->di_magic = cpu_to_be16(from->di_magic); + to->di_mode = cpu_to_be16(from->di_mode); + to->di_version = from ->di_version; + to->di_format = from->di_format; + to->di_onlink = cpu_to_be16(from->di_onlink); + to->di_uid = cpu_to_be32(from->di_uid); + to->di_gid = cpu_to_be32(from->di_gid); + to->di_nlink = cpu_to_be32(from->di_nlink); + to->di_projid_lo = cpu_to_be16(from->di_projid_lo); + to->di_projid_hi = cpu_to_be16(from->di_projid_hi); + memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad)); + to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec); + to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec); + to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec); + to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec); + to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec); + to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec); + to->di_size = cpu_to_be64(from->di_size); + to->di_nblocks = cpu_to_be64(from->di_nblocks); + to->di_extsize = cpu_to_be32(from->di_extsize); + to->di_nextents = cpu_to_be32(from->di_nextents); + to->di_anextents = cpu_to_be16(from->di_anextents); + to->di_forkoff = from->di_forkoff; + to->di_aformat = from->di_aformat; + to->di_dmevmask = cpu_to_be32(from->di_dmevmask); + to->di_dmstate = cpu_to_be16(from->di_dmstate); + to->di_flags = cpu_to_be16(from->di_flags); + to->di_gen = cpu_to_be32(from->di_gen); + + if (from->di_version == 3) { + to->di_changecount = cpu_to_be64(from->di_changecount); + to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec); + to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec); + to->di_flags2 = cpu_to_be64(from->di_flags2); + to->di_ino = cpu_to_be64(from->di_ino); + to->di_lsn = cpu_to_be64(from->di_lsn); + memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2)); + uuid_copy(&to->di_uuid, &from->di_uuid); + to->di_flushiter = 0; + } else { + to->di_flushiter = cpu_to_be16(from->di_flushiter); + } +} + +static bool +xfs_dinode_verify( + struct xfs_mount *mp, + struct xfs_inode *ip, + struct xfs_dinode *dip) +{ + if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) + return false; + + /* only version 3 or greater inodes are extensively verified here */ + if (dip->di_version < 3) + return true; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF)) + return false; + if (be64_to_cpu(dip->di_ino) != ip->i_ino) + return false; + if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid)) + return false; + return true; +} + +void +xfs_dinode_calc_crc( + struct xfs_mount *mp, + struct xfs_dinode *dip) +{ + __uint32_t crc; + + if (dip->di_version < 3) + return; + + ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + XFS_DINODE_CRC_OFF); + dip->di_crc = xfs_end_cksum(crc); +} + +/* + * Read the disk inode attributes into the in-core inode structure. + * + * For version 5 superblocks, if we are initialising a new inode and we are not + * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new + * inode core with a random generation number. If we are keeping inodes around, + * we need to read the inode cluster to get the existing generation number off + * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode + * format) then log recovery is dependent on the di_flushiter field being + * initialised from the current on-disk value and hence we must also read the + * inode off disk. + */ +int +xfs_iread( + xfs_mount_t *mp, + xfs_trans_t *tp, + xfs_inode_t *ip, + uint iget_flags) +{ + xfs_buf_t *bp; + xfs_dinode_t *dip; + int error; + + /* + * Fill in the location information in the in-core inode. + */ + error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags); + if (error) + return error; + + /* shortcut IO on inode allocation if possible */ + if ((iget_flags & XFS_IGET_CREATE) && + xfs_sb_version_hascrc(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_IKEEP)) { + /* initialise the on-disk inode core */ + memset(&ip->i_d, 0, sizeof(ip->i_d)); + ip->i_d.di_magic = XFS_DINODE_MAGIC; + ip->i_d.di_gen = prandom_u32(); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + ip->i_d.di_version = 3; + ip->i_d.di_ino = ip->i_ino; + uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid); + } else + ip->i_d.di_version = 2; + return 0; + } + + /* + * Get pointers to the on-disk inode and the buffer containing it. + */ + error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags); + if (error) + return error; + + /* even unallocated inodes are verified */ + if (!xfs_dinode_verify(mp, ip, dip)) { + xfs_alert(mp, "%s: validation failed for inode %lld failed", + __func__, ip->i_ino); + + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); + error = XFS_ERROR(EFSCORRUPTED); + goto out_brelse; + } + + /* + * If the on-disk inode is already linked to a directory + * entry, copy all of the inode into the in-core inode. + * xfs_iformat_fork() handles copying in the inode format + * specific information. + * Otherwise, just get the truly permanent information. + */ + if (dip->di_mode) { + xfs_dinode_from_disk(&ip->i_d, dip); + error = xfs_iformat_fork(ip, dip); + if (error) { +#ifdef DEBUG + xfs_alert(mp, "%s: xfs_iformat() returned error %d", + __func__, error); +#endif /* DEBUG */ + goto out_brelse; + } + } else { + /* + * Partial initialisation of the in-core inode. Just the bits + * that xfs_ialloc won't overwrite or relies on being correct. + */ + ip->i_d.di_magic = be16_to_cpu(dip->di_magic); + ip->i_d.di_version = dip->di_version; + ip->i_d.di_gen = be32_to_cpu(dip->di_gen); + ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter); + + if (dip->di_version == 3) { + ip->i_d.di_ino = be64_to_cpu(dip->di_ino); + uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid); + } + + /* + * Make sure to pull in the mode here as well in + * case the inode is released without being used. + * This ensures that xfs_inactive() will see that + * the inode is already free and not try to mess + * with the uninitialized part of it. + */ + ip->i_d.di_mode = 0; + } + + /* + * Automatically convert version 1 inode formats in memory to version 2 + * inode format. If the inode is modified, it will get logged and + * rewritten as a version 2 inode. We can do this because we set the + * superblock feature bit for v2 inodes unconditionally during mount + * and it means the reast of the code can assume the inode version is 2 + * or higher. + */ + if (ip->i_d.di_version == 1) { + ip->i_d.di_version = 2; + memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); + ip->i_d.di_nlink = ip->i_d.di_onlink; + ip->i_d.di_onlink = 0; + xfs_set_projid(ip, 0); + } + + ip->i_delayed_blks = 0; + + /* + * Mark the buffer containing the inode as something to keep + * around for a while. This helps to keep recently accessed + * meta-data in-core longer. + */ + xfs_buf_set_ref(bp, XFS_INO_REF); + + /* + * Use xfs_trans_brelse() to release the buffer containing the on-disk + * inode, because it was acquired with xfs_trans_read_buf() in + * xfs_imap_to_bp() above. If tp is NULL, this is just a normal + * brelse(). If we're within a transaction, then xfs_trans_brelse() + * will only release the buffer if it is not dirty within the + * transaction. It will be OK to release the buffer in this case, + * because inodes on disk are never destroyed and we will be locking the + * new in-core inode before putting it in the cache where other + * processes can find it. Thus we don't have to worry about the inode + * being changed just because we released the buffer. + */ + out_brelse: + xfs_trans_brelse(tp, bp); + return error; +} diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h new file mode 100644 index 00000000000..9308c47f2a5 --- /dev/null +++ b/fs/xfs/xfs_inode_buf.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_BUF_H__ +#define __XFS_INODE_BUF_H__ + +struct xfs_inode; +struct xfs_dinode; +struct xfs_icdinode; + +/* + * Inode location information. Stored in the inode and passed to + * xfs_imap_to_bp() to get a buffer and dinode for a given inode. + */ +struct xfs_imap { + xfs_daddr_t im_blkno; /* starting BB of inode chunk */ + ushort im_len; /* length in BBs of inode chunk */ + ushort im_boffset; /* inode offset in block in bytes */ +}; + +int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, + struct xfs_imap *, struct xfs_dinode **, + struct xfs_buf **, uint, uint); +int xfs_iread(struct xfs_mount *, struct xfs_trans *, + struct xfs_inode *, uint); +void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from); +void xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from); + +#if defined(DEBUG) +void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); +#else +#define xfs_inobp_check(mp, bp) +#endif /* DEBUG */ + +#endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c new file mode 100644 index 00000000000..b031e8d0d92 --- /dev/null +++ b/fs/xfs/xfs_inode_fork.c @@ -0,0 +1,1906 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/log2.h> + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_attr_sf.h" +#include "xfs_dinode.h" + +kmem_zone_t *xfs_ifork_zone; + +STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); +STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); +STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); + +#ifdef DEBUG +/* + * Make sure that the extents in the given memory buffer + * are valid. + */ +void +xfs_validate_extents( + xfs_ifork_t *ifp, + int nrecs, + xfs_exntfmt_t fmt) +{ + xfs_bmbt_irec_t irec; + xfs_bmbt_rec_host_t rec; + int i; + + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + rec.l0 = get_unaligned(&ep->l0); + rec.l1 = get_unaligned(&ep->l1); + xfs_bmbt_get_all(&rec, &irec); + if (fmt == XFS_EXTFMT_NOSTATE) + ASSERT(irec.br_state == XFS_EXT_NORM); + } +} +#else /* DEBUG */ +#define xfs_validate_extents(ifp, nrecs, fmt) +#endif /* DEBUG */ + + +/* + * Move inode type and inode format specific information from the + * on-disk inode to the in-core inode. For fifos, devs, and sockets + * this means set if_rdev to the proper value. For files, directories, + * and symlinks this means to bring in the in-line data or extent + * pointers. For a file in B-tree format, only the root is immediately + * brought in-core. The rest will be in-lined in if_extents when it + * is first referenced (see xfs_iread_extents()). + */ +int +xfs_iformat_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip) +{ + xfs_attr_shortform_t *atp; + int size; + int error = 0; + xfs_fsize_t di_size; + + if (unlikely(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents) > + be64_to_cpu(dip->di_nblocks))) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", + (unsigned long long)ip->i_ino, + (int)(be32_to_cpu(dip->di_nextents) + + be16_to_cpu(dip->di_anextents)), + (unsigned long long) + be64_to_cpu(dip->di_nblocks)); + XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { + xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.", + (unsigned long long)ip->i_ino, + dip->di_forkoff); + XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_warn(ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: + if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) { + XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ip->i_d.di_size = 0; + ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); + break; + + case S_IFREG: + case S_IFLNK: + case S_IFDIR: + switch (dip->di_format) { + case XFS_DINODE_FMT_LOCAL: + /* + * no local regular files yet + */ + if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (local format for regular file).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(4)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + di_size = be64_to_cpu(dip->di_size); + if (unlikely(di_size < 0 || + di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %Ld for local inode).", + (unsigned long long) ip->i_ino, + (long long) di_size); + XFS_CORRUPTION_ERROR("xfs_iformat(5)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + size = (int)di_size; + error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK); + break; + default: + XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + break; + + default: + XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + if (error) { + return error; + } + if (!XFS_DFORK_Q(dip)) + return 0; + + ASSERT(ip->i_afp == NULL); + ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); + + switch (dip->di_aformat) { + case XFS_DINODE_FMT_LOCAL: + atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); + size = be16_to_cpu(atp->hdr.totsize); + + if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad attr fork size %Ld).", + (unsigned long long) ip->i_ino, + (long long) size); + XFS_CORRUPTION_ERROR("xfs_iformat(8)", + XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size); + break; + case XFS_DINODE_FMT_EXTENTS: + error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK); + break; + case XFS_DINODE_FMT_BTREE: + error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK); + break; + default: + error = XFS_ERROR(EFSCORRUPTED); + break; + } + if (error) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + xfs_idestroy_fork(ip, XFS_DATA_FORK); + } + return error; +} + +/* + * The file is in-lined in the on-disk inode. + * If it fits into if_inline_data, then copy + * it there, otherwise allocate a buffer for it + * and copy the data there. Either way, set + * if_data to point at the data. + * If we allocate a buffer for the data, make + * sure that its size is a multiple of 4 and + * record the real size in i_real_bytes. + */ +STATIC int +xfs_iformat_local( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork, + int size) +{ + xfs_ifork_t *ifp; + int real_size; + + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, + "corrupt inode %Lu (bad size %d for local fork, size = %d).", + (unsigned long long) ip->i_ino, size, + XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); + XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + ifp = XFS_IFORK_PTR(ip, whichfork); + real_size = 0; + if (size == 0) + ifp->if_u1.if_data = NULL; + else if (size <= sizeof(ifp->if_u2.if_inline_data)) + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + else { + real_size = roundup(size, 4); + ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS); + } + ifp->if_bytes = size; + ifp->if_real_bytes = real_size; + if (size) + memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFINLINE; + return 0; +} + +/* + * The file consists of a set of extents all + * of which fit into the on-disk inode. + * If there are few enough extents to fit into + * the if_inline_ext, then copy them there. + * Otherwise allocate a buffer for them and copy + * them into it. Either way, set if_extents + * to point at the extents. + */ +STATIC int +xfs_iformat_extents( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + xfs_bmbt_rec_t *dp; + xfs_ifork_t *ifp; + int nex; + int size; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + nex = XFS_DFORK_NEXTENTS(dip, whichfork); + size = nex * (uint)sizeof(xfs_bmbt_rec_t); + + /* + * If the number of extents is unreasonable, then something + * is wrong and we just bail out rather than crash in + * kmem_alloc() or memcpy() below. + */ + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", + (unsigned long long) ip->i_ino, nex); + XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, + ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_real_bytes = 0; + if (nex == 0) + ifp->if_u1.if_extents = NULL; + else if (nex <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + else + xfs_iext_add(ifp, 0, nex); + + ifp->if_bytes = size; + if (size) { + dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); + xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); + for (i = 0; i < nex; i++, dp++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + ep->l0 = get_unaligned_be64(&dp->l0); + ep->l1 = get_unaligned_be64(&dp->l1); + } + XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); + if (whichfork != XFS_DATA_FORK || + XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) + if (unlikely(xfs_check_nostate_extents( + ifp, 0, nex))) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + } + ifp->if_flags |= XFS_IFEXTENTS; + return 0; +} + +/* + * The file has too many extents to fit into + * the inode, so they are in B-tree format. + * Allocate a buffer for the root of the B-tree + * and copy the root into it. The i_extents + * field will remain NULL until all of the + * extents are read in (when they are needed). + */ +STATIC int +xfs_iformat_btree( + xfs_inode_t *ip, + xfs_dinode_t *dip, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_bmdr_block_t *dfp; + xfs_ifork_t *ifp; + /* REFERENCED */ + int nrecs; + int size; + + ifp = XFS_IFORK_PTR(ip, whichfork); + dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork); + size = XFS_BMAP_BROOT_SPACE(mp, dfp); + nrecs = be16_to_cpu(dfp->bb_numrecs); + + /* + * blow out if -- fork has less extents than can fit in + * fork (fork shouldn't be a btree format), root btree + * block has more records than can fit into the fork, + * or the number of extents is greater than the number of + * blocks. + */ + if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= + XFS_IFORK_MAXEXT(ip, whichfork) || + XFS_BMDR_SPACE_CALC(nrecs) > + XFS_DFORK_SIZE(dip, mp, whichfork) || + XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { + xfs_warn(mp, "corrupt inode %Lu (btree).", + (unsigned long long) ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, + mp, dip); + return XFS_ERROR(EFSCORRUPTED); + } + + ifp->if_broot_bytes = size; + ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS); + ASSERT(ifp->if_broot != NULL); + /* + * Copy and convert from the on-disk structure + * to the in-memory structure. + */ + xfs_bmdr_to_bmbt(ip, dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork), + ifp->if_broot, size); + ifp->if_flags &= ~XFS_IFEXTENTS; + ifp->if_flags |= XFS_IFBROOT; + + return 0; +} + +/* + * Read in extents from a btree-format inode. + * Allocate and fill in if_extents. Real work is done in xfs_bmap.c. + */ +int +xfs_iread_extents( + xfs_trans_t *tp, + xfs_inode_t *ip, + int whichfork) +{ + int error; + xfs_ifork_t *ifp; + xfs_extnum_t nextents; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { + XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW, + ip->i_mount); + return XFS_ERROR(EFSCORRUPTED); + } + nextents = XFS_IFORK_NEXTENTS(ip, whichfork); + ifp = XFS_IFORK_PTR(ip, whichfork); + + /* + * We know that the size is valid (it's checked in iformat_btree) + */ + ifp->if_bytes = ifp->if_real_bytes = 0; + ifp->if_flags |= XFS_IFEXTENTS; + xfs_iext_add(ifp, 0, nextents); + error = xfs_bmap_read_extents(tp, ip, whichfork); + if (error) { + xfs_iext_destroy(ifp); + ifp->if_flags &= ~XFS_IFEXTENTS; + return error; + } + xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); + return 0; +} +/* + * Reallocate the space for if_broot based on the number of records + * being added or deleted as indicated in rec_diff. Move the records + * and pointers in if_broot to fit the new size. When shrinking this + * will eliminate holes between the records and pointers created by + * the caller. When growing this will create holes to be filled in + * by the caller. + * + * The caller must not request to add more records than would fit in + * the on-disk inode root. If the if_broot is currently NULL, then + * if we are adding records, one will be allocated. The caller must also + * not request that the number of records go below zero, although + * it can go to zero. + * + * ip -- the inode whose if_broot area is changing + * ext_diff -- the change in the number of records, positive or negative, + * requested for the if_broot array. + */ +void +xfs_iroot_realloc( + xfs_inode_t *ip, + int rec_diff, + int whichfork) +{ + struct xfs_mount *mp = ip->i_mount; + int cur_max; + xfs_ifork_t *ifp; + struct xfs_btree_block *new_broot; + int new_max; + size_t new_size; + char *np; + char *op; + + /* + * Handle the degenerate case quietly. + */ + if (rec_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (rec_diff > 0) { + /* + * If there wasn't any memory allocated before, just + * allocate it now and get out. + */ + if (ifp->if_broot_bytes == 0) { + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff); + ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + ifp->if_broot_bytes = (int)new_size; + return; + } + + /* + * If there is already an existing if_broot, then we need + * to realloc() it and shift the pointers to their new + * location. The records don't change location because + * they are kept butted up against the btree block header. + */ + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + ifp->if_broot = kmem_realloc(ifp->if_broot, new_size, + XFS_BMAP_BROOT_SPACE_CALC(mp, cur_max), + KM_SLEEP | KM_NOFS); + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + (int)new_size); + ifp->if_broot_bytes = (int)new_size; + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t)); + return; + } + + /* + * rec_diff is less than 0. In this case, we are shrinking the + * if_broot buffer. It must already exist. If we go to zero + * records, just get rid of the root and clear the status bit. + */ + ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0)); + cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0); + new_max = cur_max + rec_diff; + ASSERT(new_max >= 0); + if (new_max > 0) + new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max); + else + new_size = 0; + if (new_size > 0) { + new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS); + /* + * First copy over the btree block header. + */ + memcpy(new_broot, ifp->if_broot, + XFS_BMBT_BLOCK_LEN(ip->i_mount)); + } else { + new_broot = NULL; + ifp->if_flags &= ~XFS_IFBROOT; + } + + /* + * Only copy the records and pointers if there are any. + */ + if (new_max > 0) { + /* + * First copy the records. + */ + op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); + np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); + memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); + + /* + * Then copy the pointers. + */ + op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1, + ifp->if_broot_bytes); + np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1, + (int)new_size); + memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t)); + } + kmem_free(ifp->if_broot); + ifp->if_broot = new_broot; + ifp->if_broot_bytes = (int)new_size; + if (ifp->if_broot) + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + return; +} + + +/* + * This is called when the amount of space needed for if_data + * is increased or decreased. The change in size is indicated by + * the number of bytes that need to be added or deleted in the + * byte_diff parameter. + * + * If the amount of space needed has decreased below the size of the + * inline buffer, then switch to using the inline buffer. Otherwise, + * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer + * to what is needed. + * + * ip -- the inode whose if_data area is changing + * byte_diff -- the change in the number of bytes, positive or negative, + * requested for the if_data array. + */ +void +xfs_idata_realloc( + xfs_inode_t *ip, + int byte_diff, + int whichfork) +{ + xfs_ifork_t *ifp; + int new_size; + int real_size; + + if (byte_diff == 0) { + return; + } + + ifp = XFS_IFORK_PTR(ip, whichfork); + new_size = (int)ifp->if_bytes + byte_diff; + ASSERT(new_size >= 0); + + if (new_size == 0) { + if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + kmem_free(ifp->if_u1.if_data); + } + ifp->if_u1.if_data = NULL; + real_size = 0; + } else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) { + /* + * If the valid extents/data can fit in if_inline_ext/data, + * copy them from the malloc'd vector and free it. + */ + if (ifp->if_u1.if_data == NULL) { + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + ASSERT(ifp->if_real_bytes != 0); + memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data, + new_size); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = ifp->if_u2.if_inline_data; + } + real_size = 0; + } else { + /* + * Stuck with malloc/realloc. + * For inline data, the underlying buffer must be + * a multiple of 4 bytes in size so that it can be + * logged and stay on word boundaries. We enforce + * that here. + */ + real_size = roundup(new_size, 4); + if (ifp->if_u1.if_data == NULL) { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + } else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) { + /* + * Only do the realloc if the underlying size + * is really changing. + */ + if (ifp->if_real_bytes != real_size) { + ifp->if_u1.if_data = + kmem_realloc(ifp->if_u1.if_data, + real_size, + ifp->if_real_bytes, + KM_SLEEP | KM_NOFS); + } + } else { + ASSERT(ifp->if_real_bytes == 0); + ifp->if_u1.if_data = kmem_alloc(real_size, + KM_SLEEP | KM_NOFS); + memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data, + ifp->if_bytes); + } + } + ifp->if_real_bytes = real_size; + ifp->if_bytes = new_size; + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); +} + +void +xfs_idestroy_fork( + xfs_inode_t *ip, + int whichfork) +{ + xfs_ifork_t *ifp; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (ifp->if_broot != NULL) { + kmem_free(ifp->if_broot); + ifp->if_broot = NULL; + } + + /* + * If the format is local, then we can't have an extents + * array so just look for an inline data array. If we're + * not local then we may or may not have an extents list, + * so check and free it up if we do. + */ + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) && + (ifp->if_u1.if_data != NULL)) { + ASSERT(ifp->if_real_bytes != 0); + kmem_free(ifp->if_u1.if_data); + ifp->if_u1.if_data = NULL; + ifp->if_real_bytes = 0; + } + } else if ((ifp->if_flags & XFS_IFEXTENTS) && + ((ifp->if_flags & XFS_IFEXTIREC) || + ((ifp->if_u1.if_extents != NULL) && + (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) { + ASSERT(ifp->if_real_bytes != 0); + xfs_iext_destroy(ifp); + } + ASSERT(ifp->if_u1.if_extents == NULL || + ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext); + ASSERT(ifp->if_real_bytes == 0); + if (whichfork == XFS_ATTR_FORK) { + kmem_zone_free(xfs_ifork_zone, ip->i_afp); + ip->i_afp = NULL; + } +} + +/* + * Convert in-core extents to on-disk form + * + * For either the data or attr fork in extent format, we need to endian convert + * the in-core extent as we place them into the on-disk inode. + * + * In the case of the data fork, the in-core and on-disk fork sizes can be + * different due to delayed allocation extents. We only copy on-disk extents + * here, so callers must always use the physical fork size to determine the + * size of the buffer passed to this routine. We will return the size actually + * used. + */ +int +xfs_iextents_copy( + xfs_inode_t *ip, + xfs_bmbt_rec_t *dp, + int whichfork) +{ + int copied; + int i; + xfs_ifork_t *ifp; + int nrecs; + xfs_fsblock_t start_block; + + ifp = XFS_IFORK_PTR(ip, whichfork); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(ifp->if_bytes > 0); + + nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); + ASSERT(nrecs > 0); + + /* + * There are some delayed allocation extents in the + * inode, so copy the extents one at a time and skip + * the delayed ones. There must be at least one + * non-delayed extent. + */ + copied = 0; + for (i = 0; i < nrecs; i++) { + xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + start_block = xfs_bmbt_get_startblock(ep); + if (isnullstartblock(start_block)) { + /* + * It's a delayed allocation extent, so skip it. + */ + continue; + } + + /* Translate to on disk format */ + put_unaligned_be64(ep->l0, &dp->l0); + put_unaligned_be64(ep->l1, &dp->l1); + dp++; + copied++; + } + ASSERT(copied != 0); + xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); + + return (copied * (uint)sizeof(xfs_bmbt_rec_t)); +} + +/* + * Each of the following cases stores data into the same region + * of the on-disk inode, so only one of them can be valid at + * any given time. While it is possible to have conflicting formats + * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is + * in EXTENTS format, this can only happen when the fork has + * changed formats after being modified but before being flushed. + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. + */ +void +xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, + xfs_inode_log_item_t *iip, + int whichfork) +{ + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = + { XFS_ILOG_DDATA, XFS_ILOG_ADATA }; + static const short extflag[2] = + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) + return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, + * for the attribute fork. + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); + return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: + if ((iip->ili_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); + ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork)); + memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes); + } + break; + + case XFS_DINODE_FMT_EXTENTS: + ASSERT((ifp->if_flags & XFS_IFEXTENTS) || + !(iip->ili_fields & extflag[whichfork])); + if ((iip->ili_fields & extflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(xfs_iext_get_ext(ifp, 0)); + ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0); + (void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp, + whichfork); + } + break; + + case XFS_DINODE_FMT_BTREE: + if ((iip->ili_fields & brootflag[whichfork]) && + (ifp->if_broot_bytes > 0)) { + ASSERT(ifp->if_broot != NULL); + ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <= + XFS_IFORK_SIZE(ip, whichfork)); + xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes, + (xfs_bmdr_block_t *)cp, + XFS_DFORK_SIZE(dip, mp, whichfork)); + } + break; + + case XFS_DINODE_FMT_DEV: + if (iip->ili_fields & XFS_ILOG_DEV) { + ASSERT(whichfork == XFS_DATA_FORK); + xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev); + } + break; + + case XFS_DINODE_FMT_UUID: + if (iip->ili_fields & XFS_ILOG_UUID) { + ASSERT(whichfork == XFS_DATA_FORK); + memcpy(XFS_DFORK_DPTR(dip), + &ip->i_df.if_u2.if_uuid, + sizeof(uuid_t)); + } + break; + + default: + ASSERT(0); + break; + } +} + +/* + * Return a pointer to the extent record at file index idx. + */ +xfs_bmbt_rec_host_t * +xfs_iext_get_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx) /* index of target extent */ +{ + ASSERT(idx >= 0); + ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + + if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { + return ifp->if_u1.if_ext_irec->er_extbuf; + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_ext_irec_t *erp; /* irec pointer */ + int erp_idx = 0; /* irec index */ + xfs_extnum_t page_idx = idx; /* ext index in target list */ + + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + return &erp->er_extbuf[page_idx]; + } else if (ifp->if_bytes) { + return &ifp->if_u1.if_extents[idx]; + } else { + return NULL; + } +} + +/* + * Insert new item(s) into the extent records for incore inode + * fork 'ifp'. 'count' new items are inserted at index 'idx'. + */ +void +xfs_iext_insert( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* starting index of new items */ + xfs_extnum_t count, /* number of inserted items */ + xfs_bmbt_irec_t *new, /* items to insert */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t i; /* extent record index */ + + trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_); + + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + xfs_iext_add(ifp, idx, count); + for (i = idx; i < idx + count; i++, new++) + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new); +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be increased. The ext_diff parameter stores the + * number of new extents being added and the idx parameter contains + * the extent index where the new extents will be added. If the new + * extents are being appended, then we just need to (re)allocate and + * initialize the space. Otherwise, if the new extents are being + * inserted into the middle of the existing entries, a bit more work + * is required to make room for the new extents to be inserted. The + * caller is responsible for filling in the new extent entries upon + * return. + */ +void +xfs_iext_add( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin adding exts */ + int ext_diff) /* number of extents to add */ +{ + int byte_diff; /* new bytes being added */ + int new_size; /* size of extents after adding */ + xfs_extnum_t nextents; /* number of extents in file */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT((idx >= 0) && (idx <= nextents)); + byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); + new_size = ifp->if_bytes + byte_diff; + /* + * If the new number of extents (nextents + ext_diff) + * fits inside the inode, then continue to use the inline + * extent buffer. + */ + if (nextents + ext_diff <= XFS_INLINE_EXTS) { + if (idx < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff], + &ifp->if_u2.if_inline_ext[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff); + } + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; + } + /* + * Otherwise use a linear (direct) extent list. + * If the extents are currently inside the inode, + * xfs_iext_realloc_direct will switch us from + * inline to direct extent allocation mode. + */ + else if (nextents + ext_diff <= XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, new_size); + if (idx < nextents) { + memmove(&ifp->if_u1.if_extents[idx + ext_diff], + &ifp->if_u1.if_extents[idx], + (nextents - idx) * sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u1.if_extents[idx], 0, byte_diff); + } + } + /* Indirection array */ + else { + xfs_ext_irec_t *erp; + int erp_idx = 0; + int page_idx = idx; + + ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS); + if (ifp->if_flags & XFS_IFEXTIREC) { + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1); + } else { + xfs_iext_irec_init(ifp); + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = ifp->if_u1.if_ext_irec; + } + /* Extents fit in target extent page */ + if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) { + if (page_idx < erp->er_extcount) { + memmove(&erp->er_extbuf[page_idx + ext_diff], + &erp->er_extbuf[page_idx], + (erp->er_extcount - page_idx) * + sizeof(xfs_bmbt_rec_t)); + memset(&erp->er_extbuf[page_idx], 0, byte_diff); + } + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + } + /* Insert a new extent page */ + else if (erp) { + xfs_iext_add_indirect_multi(ifp, + erp_idx, page_idx, ext_diff); + } + /* + * If extent(s) are being appended to the last page in + * the indirection array and the new extent(s) don't fit + * in the page, then erp is NULL and erp_idx is set to + * the next index needed in the indirection array. + */ + else { + uint count = ext_diff; + + while (count) { + erp = xfs_iext_irec_new(ifp, erp_idx); + erp->er_extcount = min(count, XFS_LINEAR_EXTS); + count -= erp->er_extcount; + if (count) + erp_idx++; + } + } + } + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being added to the indirection + * array and the new extents do not fit in the target extent list. The + * erp_idx parameter contains the irec index for the target extent list + * in the indirection array, and the idx parameter contains the extent + * index within the list. The number of extents being added is stored + * in the count parameter. + * + * |-------| |-------| + * | | | | idx - number of extents before idx + * | idx | | count | + * | | | | count - number of extents being inserted at idx + * |-------| |-------| + * | count | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_add_indirect_multi( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* target extent irec index */ + xfs_extnum_t idx, /* index within target list */ + int count) /* new extents being added */ +{ + int byte_diff; /* new bytes being added */ + xfs_ext_irec_t *erp; /* pointer to irec entry */ + xfs_extnum_t ext_diff; /* number of extents to add */ + xfs_extnum_t ext_cnt; /* new extents still needed */ + xfs_extnum_t nex2; /* extents after idx + count */ + xfs_bmbt_rec_t *nex2_ep = NULL; /* temp list for nex2 extents */ + int nlists; /* number of irec's (lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex2 = erp->er_extcount - idx; + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* + * Save second part of target extent list + * (all extents past */ + if (nex2) { + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS); + memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff); + erp->er_extcount -= nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2); + memset(&erp->er_extbuf[idx], 0, byte_diff); + } + + /* + * Add the new extents to the end of the target + * list, then allocate new irec record(s) and + * extent buffer(s) as needed to store the rest + * of the new extents. + */ + ext_cnt = count; + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount); + if (ext_diff) { + erp->er_extcount += ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + while (ext_cnt) { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS); + erp->er_extcount = ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff); + ext_cnt -= ext_diff; + } + + /* Add nex2 extents back to indirection array */ + if (nex2) { + xfs_extnum_t ext_avail; + int i; + + byte_diff = nex2 * sizeof(xfs_bmbt_rec_t); + ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; + i = 0; + /* + * If nex2 extents fit in the current page, append + * nex2_ep after the new extents. + */ + if (nex2 <= ext_avail) { + i = erp->er_extcount; + } + /* + * Otherwise, check if space is available in the + * next page. + */ + else if ((erp_idx < nlists - 1) && + (nex2 <= (ext_avail = XFS_LINEAR_EXTS - + ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) { + erp_idx++; + erp++; + /* Create a hole for nex2 extents */ + memmove(&erp->er_extbuf[nex2], erp->er_extbuf, + erp->er_extcount * sizeof(xfs_bmbt_rec_t)); + } + /* + * Final choice, create a new extent page for + * nex2 extents. + */ + else { + erp_idx++; + erp = xfs_iext_irec_new(ifp, erp_idx); + } + memmove(&erp->er_extbuf[i], nex2_ep, byte_diff); + kmem_free(nex2_ep); + erp->er_extcount += nex2; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2); + } +} + +/* + * This is called when the amount of space required for incore file + * extents needs to be decreased. The ext_diff parameter stores the + * number of extents to be removed and the idx parameter contains + * the extent index where the extents will be removed from. + * + * If the amount of space needed has decreased below the linear + * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous + * extent array. Otherwise, use kmem_realloc() to adjust the + * size to what is needed. + */ +void +xfs_iext_remove( + xfs_inode_t *ip, /* incore inode pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff, /* number of extents to remove */ + int state) /* type of extent conversion */ +{ + xfs_ifork_t *ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df; + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + trace_xfs_iext_remove(ip, idx, state, _RET_IP_); + + ASSERT(ext_diff > 0); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else if (ifp->if_flags & XFS_IFEXTIREC) { + xfs_iext_remove_indirect(ifp, idx, ext_diff); + } else if (ifp->if_real_bytes) { + xfs_iext_remove_direct(ifp, idx, ext_diff); + } else { + xfs_iext_remove_inline(ifp, idx, ext_diff); + } + ifp->if_bytes = new_size; +} + +/* + * This removes ext_diff extents from the inline buffer, beginning + * at extent index idx. + */ +void +xfs_iext_remove_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + int nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + ASSERT(idx < XFS_INLINE_EXTS); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(((nextents - ext_diff) > 0) && + (nextents - ext_diff) < XFS_INLINE_EXTS); + + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u2.if_inline_ext[idx], + &ifp->if_u2.if_inline_ext[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + } else { + memset(&ifp->if_u2.if_inline_ext[idx], 0, + ext_diff * sizeof(xfs_bmbt_rec_t)); + } +} + +/* + * This removes ext_diff extents from a linear (direct) extent list, + * beginning at extent index idx. If the extents are being removed + * from the end of the list (ie. truncate) then we just need to re- + * allocate the list to remove the extra space. Otherwise, if the + * extents are being removed from the middle of the existing extent + * entries, then we first need to move the extent records beginning + * at idx + ext_diff up in the list to overwrite the records being + * removed, then remove the extra space via kmem_realloc. + */ +void +xfs_iext_remove_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing exts */ + int ext_diff) /* number of extents to remove */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int new_size; /* size of extents after removal */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + new_size = ifp->if_bytes - + (ext_diff * sizeof(xfs_bmbt_rec_t)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (new_size == 0) { + xfs_iext_destroy(ifp); + return; + } + /* Move extents up in the list (if needed) */ + if (idx + ext_diff < nextents) { + memmove(&ifp->if_u1.if_extents[idx], + &ifp->if_u1.if_extents[idx + ext_diff], + (nextents - (idx + ext_diff)) * + sizeof(xfs_bmbt_rec_t)); + } + memset(&ifp->if_u1.if_extents[nextents - ext_diff], + 0, ext_diff * sizeof(xfs_bmbt_rec_t)); + /* + * Reallocate the direct extent list. If the extents + * will fit inside the inode then xfs_iext_realloc_direct + * will switch from direct to inline extent allocation + * mode for us. + */ + xfs_iext_realloc_direct(ifp, new_size); + ifp->if_bytes = new_size; +} + +/* + * This is called when incore extents are being removed from the + * indirection array and the extents being removed span multiple extent + * buffers. The idx parameter contains the file extent index where we + * want to begin removing extents, and the count parameter contains + * how many extents need to be removed. + * + * |-------| |-------| + * | nex1 | | | nex1 - number of extents before idx + * |-------| | count | + * | | | | count - number of extents being removed at idx + * | count | |-------| + * | | | nex2 | nex2 - number of extents after idx + count + * |-------| |-------| + */ +void +xfs_iext_remove_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t idx, /* index to begin removing extents */ + int count) /* number of extents to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int erp_idx = 0; /* indirection array index */ + xfs_extnum_t ext_cnt; /* extents left to remove */ + xfs_extnum_t ext_diff; /* extents to remove in current list */ + xfs_extnum_t nex1; /* number of extents before idx */ + xfs_extnum_t nex2; /* extents after idx + count */ + int page_idx = idx; /* index in target extent list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0); + ASSERT(erp != NULL); + nex1 = page_idx; + ext_cnt = count; + while (ext_cnt) { + nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0); + ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1)); + /* + * Check for deletion of entire list; + * xfs_iext_irec_remove() updates extent offsets. + */ + if (ext_diff == erp->er_extcount) { + xfs_iext_irec_remove(ifp, erp_idx); + ext_cnt -= ext_diff; + nex1 = 0; + if (ext_cnt) { + ASSERT(erp_idx < ifp->if_real_bytes / + XFS_IEXT_BUFSZ); + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + nex1 = 0; + continue; + } else { + break; + } + } + /* Move extents up (if needed) */ + if (nex2) { + memmove(&erp->er_extbuf[nex1], + &erp->er_extbuf[nex1 + ext_diff], + nex2 * sizeof(xfs_bmbt_rec_t)); + } + /* Zero out rest of page */ + memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ - + ((nex1 + nex2) * sizeof(xfs_bmbt_rec_t)))); + /* Update remaining counters */ + erp->er_extcount -= ext_diff; + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff); + ext_cnt -= ext_diff; + nex1 = 0; + erp_idx++; + erp++; + } + ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t); + xfs_iext_irec_compact(ifp); +} + +/* + * Create, destroy, or resize a linear (direct) block of extents. + */ +void +xfs_iext_realloc_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new size of extents after adding */ +{ + int rnew_size; /* real new size of extents */ + + rnew_size = new_size; + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) || + ((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) && + (new_size != ifp->if_real_bytes))); + + /* Free extent records */ + if (new_size == 0) { + xfs_iext_destroy(ifp); + } + /* Resize direct extent list and zero any new bytes */ + else if (ifp->if_real_bytes) { + /* Check if extents will fit inside the inode */ + if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) { + xfs_iext_direct_to_inline(ifp, new_size / + (uint)sizeof(xfs_bmbt_rec_t)); + ifp->if_bytes = new_size; + return; + } + if (!is_power_of_2(new_size)){ + rnew_size = roundup_pow_of_two(new_size); + } + if (rnew_size != ifp->if_real_bytes) { + ifp->if_u1.if_extents = + kmem_realloc(ifp->if_u1.if_extents, + rnew_size, + ifp->if_real_bytes, KM_NOFS); + } + if (rnew_size > ifp->if_real_bytes) { + memset(&ifp->if_u1.if_extents[ifp->if_bytes / + (uint)sizeof(xfs_bmbt_rec_t)], 0, + rnew_size - ifp->if_real_bytes); + } + } + /* Switch from the inline extent buffer to a direct extent list */ + else { + if (!is_power_of_2(new_size)) { + rnew_size = roundup_pow_of_two(new_size); + } + xfs_iext_inline_to_direct(ifp, rnew_size); + } + ifp->if_real_bytes = rnew_size; + ifp->if_bytes = new_size; +} + +/* + * Switch from linear (direct) extent records to inline buffer. + */ +void +xfs_iext_direct_to_inline( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t nextents) /* number of extents in file */ +{ + ASSERT(ifp->if_flags & XFS_IFEXTENTS); + ASSERT(nextents <= XFS_INLINE_EXTS); + /* + * The inline buffer was zeroed when we switched + * from inline to direct extent allocation mode, + * so we don't need to clear it here. + */ + memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents, + nextents * sizeof(xfs_bmbt_rec_t)); + kmem_free(ifp->if_u1.if_extents); + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; + ifp->if_real_bytes = 0; +} + +/* + * Switch from inline buffer to linear (direct) extent records. + * new_size should already be rounded up to the next power of 2 + * by the caller (when appropriate), so use new_size as it is. + * However, since new_size may be rounded up, we can't update + * if_bytes here. It is the caller's responsibility to update + * if_bytes upon return. + */ +void +xfs_iext_inline_to_direct( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* number of extents in file */ +{ + ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS); + memset(ifp->if_u1.if_extents, 0, new_size); + if (ifp->if_bytes) { + memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext, + ifp->if_bytes); + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_real_bytes = new_size; +} + +/* + * Resize an extent indirection array to new_size bytes. + */ +STATIC void +xfs_iext_realloc_indirect( + xfs_ifork_t *ifp, /* inode fork pointer */ + int new_size) /* new indirection array size */ +{ + int nlists; /* number of irec's (ex lists) */ + int size; /* current indirection array size */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + size = nlists * sizeof(xfs_ext_irec_t); + ASSERT(ifp->if_real_bytes); + ASSERT((new_size >= 0) && (new_size != size)); + if (new_size == 0) { + xfs_iext_destroy(ifp); + } else { + ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *) + kmem_realloc(ifp->if_u1.if_ext_irec, + new_size, size, KM_NOFS); + } +} + +/* + * Switch from indirection array to linear (direct) extent allocations. + */ +STATIC void +xfs_iext_indirect_to_direct( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + int size; /* size of file extents */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + size = nextents * sizeof(xfs_bmbt_rec_t); + + xfs_iext_irec_compact_pages(ifp); + ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); + + ep = ifp->if_u1.if_ext_irec->er_extbuf; + kmem_free(ifp->if_u1.if_ext_irec); + ifp->if_flags &= ~XFS_IFEXTIREC; + ifp->if_u1.if_extents = ep; + ifp->if_bytes = size; + if (nextents < XFS_LINEAR_EXTS) { + xfs_iext_realloc_direct(ifp, size); + } +} + +/* + * Free incore file extents. + */ +void +xfs_iext_destroy( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + if (ifp->if_flags & XFS_IFEXTIREC) { + int erp_idx; + int nlists; + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) { + xfs_iext_irec_remove(ifp, erp_idx); + } + ifp->if_flags &= ~XFS_IFEXTIREC; + } else if (ifp->if_real_bytes) { + kmem_free(ifp->if_u1.if_extents); + } else if (ifp->if_bytes) { + memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS * + sizeof(xfs_bmbt_rec_t)); + } + ifp->if_u1.if_extents = NULL; + ifp->if_real_bytes = 0; + ifp->if_bytes = 0; +} + +/* + * Return a pointer to the extent record for file system block bno. + */ +xfs_bmbt_rec_host_t * /* pointer to found extent record */ +xfs_iext_bno_to_ext( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + xfs_extnum_t *idxp) /* index of target extent */ +{ + xfs_bmbt_rec_host_t *base; /* pointer to first extent */ + xfs_filblks_t blockcount = 0; /* number of blocks in extent */ + xfs_bmbt_rec_host_t *ep = NULL; /* pointer to target extent */ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + int high; /* upper boundary in search */ + xfs_extnum_t idx = 0; /* index of target extent */ + int low; /* lower boundary in search */ + xfs_extnum_t nextents; /* number of file extents */ + xfs_fileoff_t startoff = 0; /* start offset of extent */ + + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + if (nextents == 0) { + *idxp = 0; + return NULL; + } + low = 0; + if (ifp->if_flags & XFS_IFEXTIREC) { + /* Find target extent list */ + int erp_idx = 0; + erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx); + base = erp->er_extbuf; + high = erp->er_extcount - 1; + } else { + base = ifp->if_u1.if_extents; + high = nextents - 1; + } + /* Binary search extent records */ + while (low <= high) { + idx = (low + high) >> 1; + ep = base + idx; + startoff = xfs_bmbt_get_startoff(ep); + blockcount = xfs_bmbt_get_blockcount(ep); + if (bno < startoff) { + high = idx - 1; + } else if (bno >= startoff + blockcount) { + low = idx + 1; + } else { + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + *idxp = idx; + return ep; + } + } + /* Convert back to file-based extent index */ + if (ifp->if_flags & XFS_IFEXTIREC) { + idx += erp->er_extoff; + } + if (bno >= startoff + blockcount) { + if (++idx == nextents) { + ep = NULL; + } else { + ep = xfs_iext_get_ext(ifp, idx); + } + } + *idxp = idx; + return ep; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record for filesystem block bno. Store the index of the + * target irec in *erp_idxp. + */ +xfs_ext_irec_t * /* pointer to found extent record */ +xfs_iext_bno_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_fileoff_t bno, /* block number to search for */ + int *erp_idxp) /* irec index of target ext list */ +{ + xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ + xfs_ext_irec_t *erp_next; /* next indirection array entry */ + int erp_idx; /* indirection array index */ + int nlists; /* number of extent irec's (lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL; + if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) { + high = erp_idx - 1; + } else if (erp_next && bno >= + xfs_bmbt_get_startoff(erp_next->er_extbuf)) { + low = erp_idx + 1; + } else { + break; + } + } + *erp_idxp = erp_idx; + return erp; +} + +/* + * Return a pointer to the indirection array entry containing the + * extent record at file extent index *idxp. Store the index of the + * target irec in *erp_idxp and store the page index of the target + * extent record in *idxp. + */ +xfs_ext_irec_t * +xfs_iext_idx_to_irec( + xfs_ifork_t *ifp, /* inode fork pointer */ + xfs_extnum_t *idxp, /* extent index (file -> page) */ + int *erp_idxp, /* pointer to target irec */ + int realloc) /* new bytes were just added */ +{ + xfs_ext_irec_t *prev; /* pointer to previous irec */ + xfs_ext_irec_t *erp = NULL; /* pointer to current irec */ + int erp_idx; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + int high; /* binary search upper limit */ + int low; /* binary search lower limit */ + xfs_extnum_t page_idx = *idxp; /* extent index in target list */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + ASSERT(page_idx >= 0); + ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp_idx = 0; + low = 0; + high = nlists - 1; + + /* Binary search extent irec's */ + while (low <= high) { + erp_idx = (low + high) >> 1; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + prev = erp_idx > 0 ? erp - 1 : NULL; + if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff && + realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) { + high = erp_idx - 1; + } else if (page_idx > erp->er_extoff + erp->er_extcount || + (page_idx == erp->er_extoff + erp->er_extcount && + !realloc)) { + low = erp_idx + 1; + } else if (page_idx == erp->er_extoff + erp->er_extcount && + erp->er_extcount == XFS_LINEAR_EXTS) { + ASSERT(realloc); + page_idx = 0; + erp_idx++; + erp = erp_idx < nlists ? erp + 1 : NULL; + break; + } else { + page_idx -= erp->er_extoff; + break; + } + } + *idxp = page_idx; + *erp_idxp = erp_idx; + return(erp); +} + +/* + * Allocate and initialize an indirection array once the space needed + * for incore extents increases above XFS_IEXT_BUFSZ. + */ +void +xfs_iext_irec_init( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + xfs_extnum_t nextents; /* number of extents in file */ + + ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + ASSERT(nextents <= XFS_LINEAR_EXTS); + + erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); + + if (nextents == 0) { + ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + } else if (!ifp->if_real_bytes) { + xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ); + } else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) { + xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ); + } + erp->er_extbuf = ifp->if_u1.if_extents; + erp->er_extcount = nextents; + erp->er_extoff = 0; + + ifp->if_flags |= XFS_IFEXTIREC; + ifp->if_real_bytes = XFS_IEXT_BUFSZ; + ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t); + ifp->if_u1.if_ext_irec = erp; + + return; +} + +/* + * Allocate and initialize a new entry in the indirection array. + */ +xfs_ext_irec_t * +xfs_iext_irec_new( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* index for new irec */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + + /* Resize indirection array */ + xfs_iext_realloc_indirect(ifp, ++nlists * + sizeof(xfs_ext_irec_t)); + /* + * Move records down in the array so the + * new page can use erp_idx. + */ + erp = ifp->if_u1.if_ext_irec; + for (i = nlists - 1; i > erp_idx; i--) { + memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t)); + } + ASSERT(i == erp_idx); + + /* Initialize new extent record */ + erp = ifp->if_u1.if_ext_irec; + erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS); + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; + memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ); + erp[erp_idx].er_extcount = 0; + erp[erp_idx].er_extoff = erp_idx > 0 ? + erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0; + return (&erp[erp_idx]); +} + +/* + * Remove a record from the indirection array. + */ +void +xfs_iext_irec_remove( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx) /* irec index to remove */ +{ + xfs_ext_irec_t *erp; /* indirection array pointer */ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + if (erp->er_extbuf) { + xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, + -erp->er_extcount); + kmem_free(erp->er_extbuf); + } + /* Compact extent records */ + erp = ifp->if_u1.if_ext_irec; + for (i = erp_idx; i < nlists - 1; i++) { + memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t)); + } + /* + * Manually free the last extent record from the indirection + * array. A call to xfs_iext_realloc_indirect() with a size + * of zero would result in a call to xfs_iext_destroy() which + * would in turn call this function again, creating a nasty + * infinite loop. + */ + if (--nlists) { + xfs_iext_realloc_indirect(ifp, + nlists * sizeof(xfs_ext_irec_t)); + } else { + kmem_free(ifp->if_u1.if_ext_irec); + } + ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ; +} + +/* + * This is called to clean up large amounts of unused memory allocated + * by the indirection array. Before compacting anything though, verify + * that the indirection array is still needed and switch back to the + * linear extent list (or even the inline buffer) if possible. The + * compaction policy is as follows: + * + * Full Compaction: Extents fit into a single page (or inline buffer) + * Partial Compaction: Extents occupy less than 50% of allocated space + * No Compaction: Extents occupy at least 50% of allocated space + */ +void +xfs_iext_irec_compact( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_extnum_t nextents; /* number of extents in file */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + + if (nextents == 0) { + xfs_iext_destroy(ifp); + } else if (nextents <= XFS_INLINE_EXTS) { + xfs_iext_indirect_to_direct(ifp); + xfs_iext_direct_to_inline(ifp, nextents); + } else if (nextents <= XFS_LINEAR_EXTS) { + xfs_iext_indirect_to_direct(ifp); + } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { + xfs_iext_irec_compact_pages(ifp); + } +} + +/* + * Combine extents from neighboring extent pages. + */ +void +xfs_iext_irec_compact_pages( + xfs_ifork_t *ifp) /* inode fork pointer */ +{ + xfs_ext_irec_t *erp, *erp_next;/* pointers to irec entries */ + int erp_idx = 0; /* indirection array index */ + int nlists; /* number of irec's (ex lists) */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + while (erp_idx < nlists - 1) { + erp = &ifp->if_u1.if_ext_irec[erp_idx]; + erp_next = erp + 1; + if (erp_next->er_extcount <= + (XFS_LINEAR_EXTS - erp->er_extcount)) { + memcpy(&erp->er_extbuf[erp->er_extcount], + erp_next->er_extbuf, erp_next->er_extcount * + sizeof(xfs_bmbt_rec_t)); + erp->er_extcount += erp_next->er_extcount; + /* + * Free page before removing extent record + * so er_extoffs don't get modified in + * xfs_iext_irec_remove. + */ + kmem_free(erp_next->er_extbuf); + erp_next->er_extbuf = NULL; + xfs_iext_irec_remove(ifp, erp_idx + 1); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + } else { + erp_idx++; + } + } +} + +/* + * This is called to update the er_extoff field in the indirection + * array when extents have been added or removed from one of the + * extent lists. erp_idx contains the irec index to begin updating + * at and ext_diff contains the number of extents that were added + * or removed. + */ +void +xfs_iext_irec_update_extoffs( + xfs_ifork_t *ifp, /* inode fork pointer */ + int erp_idx, /* irec index to update */ + int ext_diff) /* number of new extents */ +{ + int i; /* loop counter */ + int nlists; /* number of irec's (ex lists */ + + ASSERT(ifp->if_flags & XFS_IFEXTIREC); + nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; + for (i = erp_idx; i < nlists; i++) { + ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; + } +} diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h new file mode 100644 index 00000000000..7d3b1ed6dcb --- /dev/null +++ b/fs/xfs/xfs_inode_fork.h @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_INODE_FORK_H__ +#define __XFS_INODE_FORK_H__ + +struct xfs_inode_log_item; +struct xfs_dinode; + +/* + * The following xfs_ext_irec_t struct introduces a second (top) level + * to the in-core extent allocation scheme. These structs are allocated + * in a contiguous block, creating an indirection array where each entry + * (irec) contains a pointer to a buffer of in-core extent records which + * it manages. Each extent buffer is 4k in size, since 4k is the system + * page size on Linux i386 and systems with larger page sizes don't seem + * to gain much, if anything, by using their native page size as the + * extent buffer size. Also, using 4k extent buffers everywhere provides + * a consistent interface for CXFS across different platforms. + * + * There is currently no limit on the number of irec's (extent lists) + * allowed, so heavily fragmented files may require an indirection array + * which spans multiple system pages of memory. The number of extents + * which would require this amount of contiguous memory is very large + * and should not cause problems in the foreseeable future. However, + * if the memory needed for the contiguous array ever becomes a problem, + * it is possible that a third level of indirection may be required. + */ +typedef struct xfs_ext_irec { + xfs_bmbt_rec_host_t *er_extbuf; /* block of extent records */ + xfs_extnum_t er_extoff; /* extent offset in file */ + xfs_extnum_t er_extcount; /* number of extents in page/block */ +} xfs_ext_irec_t; + +/* + * File incore extent information, present for each of data & attr forks. + */ +#define XFS_IEXT_BUFSZ 4096 +#define XFS_LINEAR_EXTS (XFS_IEXT_BUFSZ / (uint)sizeof(xfs_bmbt_rec_t)) +#define XFS_INLINE_EXTS 2 +#define XFS_INLINE_DATA 32 +typedef struct xfs_ifork { + int if_bytes; /* bytes in if_u1 */ + int if_real_bytes; /* bytes allocated in if_u1 */ + struct xfs_btree_block *if_broot; /* file's incore btree root */ + short if_broot_bytes; /* bytes allocated for root */ + unsigned char if_flags; /* per-fork flags */ + union { + xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ + xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ + char *if_data; /* inline file data */ + } if_u1; + union { + xfs_bmbt_rec_host_t if_inline_ext[XFS_INLINE_EXTS]; + /* very small file extents */ + char if_inline_data[XFS_INLINE_DATA]; + /* very small file data */ + xfs_dev_t if_rdev; /* dev number if special */ + uuid_t if_uuid; /* mount point value */ + } if_u2; +} xfs_ifork_t; + +/* + * Per-fork incore inode flags. + */ +#define XFS_IFINLINE 0x01 /* Inline data is read in */ +#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */ +#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */ +#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */ + +/* + * Fork handling. + */ + +#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0) +#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3)) + +#define XFS_IFORK_PTR(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + &(ip)->i_df : \ + (ip)->i_afp) +#define XFS_IFORK_DSIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_IFORK_BOFF(ip) : \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version)) +#define XFS_IFORK_ASIZE(ip) \ + (XFS_IFORK_Q(ip) ? \ + XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \ + XFS_IFORK_BOFF(ip) : \ + 0) +#define XFS_IFORK_SIZE(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + XFS_IFORK_DSIZE(ip) : \ + XFS_IFORK_ASIZE(ip)) +#define XFS_IFORK_FORMAT(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_format : \ + (ip)->i_d.di_aformat) +#define XFS_IFORK_FMT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_format = (n)) : \ + ((ip)->i_d.di_aformat = (n))) +#define XFS_IFORK_NEXTENTS(ip,w) \ + ((w) == XFS_DATA_FORK ? \ + (ip)->i_d.di_nextents : \ + (ip)->i_d.di_anextents) +#define XFS_IFORK_NEXT_SET(ip,w,n) \ + ((w) == XFS_DATA_FORK ? \ + ((ip)->i_d.di_nextents = (n)) : \ + ((ip)->i_d.di_anextents = (n))) +#define XFS_IFORK_MAXEXT(ip, w) \ + (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t)) + +int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int); +void xfs_idestroy_fork(struct xfs_inode *, int); +void xfs_idata_realloc(struct xfs_inode *, int, int); +void xfs_iroot_realloc(struct xfs_inode *, int, int); +int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int); +int xfs_iextents_copy(struct xfs_inode *, struct xfs_bmbt_rec *, + int); + +struct xfs_bmbt_rec_host * + xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, + struct xfs_bmbt_irec *, int); +void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_add_indirect_multi(struct xfs_ifork *, int, + xfs_extnum_t, int); +void xfs_iext_remove(struct xfs_inode *, xfs_extnum_t, int, int); +void xfs_iext_remove_inline(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_direct(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_remove_indirect(struct xfs_ifork *, xfs_extnum_t, int); +void xfs_iext_realloc_direct(struct xfs_ifork *, int); +void xfs_iext_direct_to_inline(struct xfs_ifork *, xfs_extnum_t); +void xfs_iext_inline_to_direct(struct xfs_ifork *, int); +void xfs_iext_destroy(struct xfs_ifork *); +struct xfs_bmbt_rec_host * + xfs_iext_bno_to_ext(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_bno_to_irec(struct xfs_ifork *, xfs_fileoff_t, int *); +struct xfs_ext_irec * + xfs_iext_idx_to_irec(struct xfs_ifork *, xfs_extnum_t *, int *, + int); +void xfs_iext_irec_init(struct xfs_ifork *); +struct xfs_ext_irec * + xfs_iext_irec_new(struct xfs_ifork *, int); +void xfs_iext_irec_remove(struct xfs_ifork *, int); +void xfs_iext_irec_compact(struct xfs_ifork *); +void xfs_iext_irec_compact_pages(struct xfs_ifork *); +void xfs_iext_irec_compact_full(struct xfs_ifork *); +void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); + +extern struct kmem_zone *xfs_ifork_zone; + +#endif /* __XFS_INODE_FORK_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index f8e80d8e723..a640137b357 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -17,628 +17,455 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_buf_item.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_trans_priv.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_rw.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_trans_priv.h" +#include "xfs_dinode.h" +#include "xfs_log.h" kmem_zone_t *xfs_ili_zone; /* inode log item zone */ -/* - * This returns the number of iovecs needed to log the given inode item. - * - * We need one iovec for the inode log format structure, one for the - * inode core, and possibly one for the inode data/extents/b-tree root - * and one for the inode attribute data/extents/b-tree root. - */ -STATIC uint -xfs_inode_item_size( - xfs_inode_log_item_t *iip) +static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) { - uint nvecs; - xfs_inode_t *ip; - - ip = iip->ili_inode; - nvecs = 2; + return container_of(lip, struct xfs_inode_log_item, ili_item); +} - /* - * Only log the data/extents/b-tree root if there is something - * left to log. - */ - iip->ili_format.ilf_fields |= XFS_ILOG_CORE; +STATIC void +xfs_inode_item_data_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DEXT) && - (ip->i_d.di_nextents > 0) && - (ip->i_df.if_bytes > 0)) { - ASSERT(ip->i_df.if_u1.if_extents != NULL); - nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_DEXT; + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { + /* worst case, doesn't subtract delalloc extents */ + *nbytes += XFS_IFORK_DSIZE(ip); + *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: - ASSERT(ip->i_df.if_ext_max == - XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t)); - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) && - (ip->i_df.if_broot_bytes > 0)) { - ASSERT(ip->i_df.if_broot != NULL); - nvecs++; - } else { - ASSERT(!(iip->ili_format.ilf_fields & - XFS_ILOG_DBROOT)); -#ifdef XFS_TRANS_DEBUG - if (iip->ili_root_size > 0) { - ASSERT(iip->ili_root_size == - ip->i_df.if_broot_bytes); - ASSERT(memcmp(iip->ili_orig_root, - ip->i_df.if_broot, - iip->ili_root_size) == 0); - } else { - ASSERT(ip->i_df.if_broot_bytes == 0); - } -#endif - iip->ili_format.ilf_fields &= ~XFS_ILOG_DBROOT; + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { + *nbytes += ip->i_df.if_broot_bytes; + *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID); - if ((iip->ili_format.ilf_fields & XFS_ILOG_DDATA) && - (ip->i_df.if_bytes > 0)) { - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_DDATA; + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { + *nbytes += roundup(ip->i_df.if_bytes, 4); + *nvecs += 1; } break; case XFS_DINODE_FMT_DEV: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_UUID); - break; - case XFS_DINODE_FMT_UUID: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEXT | XFS_ILOG_DEV); break; - default: ASSERT(0); break; } +} - /* - * If there are no attributes associated with this file, - * then there cannot be anything more to log. - * Clear all attribute-related log flags. - */ - if (!XFS_IFORK_Q(ip)) { - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); - return nvecs; - } +STATIC void +xfs_inode_item_attr_fork_size( + struct xfs_inode_log_item *iip, + int *nvecs, + int *nbytes) +{ + struct xfs_inode *ip = iip->ili_inode; - /* - * Log any necessary attribute data. - */ switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_AEXT) && - (ip->i_d.di_anextents > 0) && - (ip->i_afp->if_bytes > 0)) { - ASSERT(ip->i_afp->if_u1.if_extents != NULL); - nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_AEXT; + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + /* worst case, doesn't subtract unused space */ + *nbytes += XFS_IFORK_ASIZE(ip); + *nvecs += 1; } break; - case XFS_DINODE_FMT_BTREE: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) && - (ip->i_afp->if_broot_bytes > 0)) { - ASSERT(ip->i_afp->if_broot != NULL); - nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_ABROOT; + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { + *nbytes += ip->i_afp->if_broot_bytes; + *nvecs += 1; } break; - case XFS_DINODE_FMT_LOCAL: - iip->ili_format.ilf_fields &= - ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); - if ((iip->ili_format.ilf_fields & XFS_ILOG_ADATA) && - (ip->i_afp->if_bytes > 0)) { - ASSERT(ip->i_afp->if_u1.if_data != NULL); - nvecs++; - } else { - iip->ili_format.ilf_fields &= ~XFS_ILOG_ADATA; + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { + *nbytes += roundup(ip->i_afp->if_bytes, 4); + *nvecs += 1; } break; - default: ASSERT(0); break; } - - return nvecs; } /* - * This is called to fill in the vector of log iovecs for the - * given inode log item. It fills the first item with an inode - * log format structure, the second with the on-disk inode structure, - * and a possible third and/or fourth with the inode data/extents/b-tree - * root and inode attributes data/extents/b-tree root. + * This returns the number of iovecs needed to log the given inode item. + * + * We need one iovec for the inode log format structure, one for the + * inode core, and possibly one for the inode data/extents/b-tree root + * and one for the inode attribute data/extents/b-tree root. */ STATIC void -xfs_inode_item_format( - xfs_inode_log_item_t *iip, - xfs_log_iovec_t *log_vector) +xfs_inode_item_size( + struct xfs_log_item *lip, + int *nvecs, + int *nbytes) { - uint nvecs; - xfs_log_iovec_t *vecp; - xfs_inode_t *ip; - size_t data_bytes; - xfs_bmbt_rec_t *ext_buffer; - int nrecs; - xfs_mount_t *mp; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + + *nvecs += 2; + *nbytes += sizeof(struct xfs_inode_log_format) + + xfs_icdinode_size(ip->i_d.di_version); - ip = iip->ili_inode; - vecp = log_vector; + xfs_inode_item_data_fork_size(iip, nvecs, nbytes); + if (XFS_IFORK_Q(ip)) + xfs_inode_item_attr_fork_size(iip, nvecs, nbytes); +} - vecp->i_addr = (xfs_caddr_t)&iip->ili_format; - vecp->i_len = sizeof(xfs_inode_log_format_t); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT); - vecp++; - nvecs = 1; +STATIC void +xfs_inode_item_format_data_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; - /* - * Clear i_update_core if the timestamps (or any other - * non-transactional modification) need flushing/logging - * and we're about to log them with the rest of the core. - * - * This is the same logic as xfs_iflush() but this code can't - * run at the same time as xfs_iflush because we're in commit - * processing here and so we have the inode lock held in - * exclusive mode. Although it doesn't really matter - * for the timestamps if both routines were to grab the - * timestamps or not. That would be ok. - * - * We clear i_update_core before copying out the data. - * This is for coordination with our timestamp updates - * that don't hold the inode lock. They will always - * update the timestamps BEFORE setting i_update_core, - * so if we clear i_update_core after they set it we - * are guaranteed to see their updates to the timestamps - * either here. Likewise, if they set it after we clear it - * here, we'll see it either on the next commit of this - * inode or the next time the inode gets flushed via - * xfs_iflush(). This depends on strongly ordered memory - * semantics, but we have that. We use the SYNCHRONIZE - * macro to make sure that the compiler does not reorder - * the i_update_core access below the data copy below. - */ - if (ip->i_update_core) { - ip->i_update_core = 0; - SYNCHRONIZE(); - } + switch (ip->i_d.di_format) { + case XFS_DINODE_FMT_EXTENTS: + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); - /* - * We don't have to worry about re-ordering here because - * the update_size field is protected by the inode lock - * and we have that held in exclusive mode. - */ - if (ip->i_update_size) - ip->i_update_size = 0; + if ((iip->ili_fields & XFS_ILOG_DEXT) && + ip->i_d.di_nextents > 0 && + ip->i_df.if_bytes > 0) { + struct xfs_bmbt_rec *p; - /* - * Make sure to get the latest atime from the Linux inode. - */ - xfs_synchronize_atime(ip); + ASSERT(ip->i_df.if_u1.if_extents != NULL); + ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); - vecp->i_addr = (xfs_caddr_t)&ip->i_d; - vecp->i_len = sizeof(xfs_dinode_core_t); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE); - vecp++; - nvecs++; - iip->ili_format.ilf_fields |= XFS_ILOG_CORE; + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); - /* - * If this is really an old format inode, then we need to - * log it as such. This means that we have to copy the link - * count from the new field to the old. We don't have to worry - * about the new fields, because nothing trusts them as long as - * the old inode version number is there. If the superblock already - * has a new version number, then we don't bother converting back. - */ - mp = ip->i_mount; - ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 || - XFS_SB_VERSION_HASNLINK(&mp->m_sb)); - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) { - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { - /* - * Convert it back. - */ - ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1); - ip->i_d.di_onlink = ip->i_d.di_nlink; - } else { - /* - * The superblock version has already been bumped, - * so just make the conversion to the new inode - * format permanent. - */ - ip->i_d.di_version = XFS_DINODE_VERSION_2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - } - } + ASSERT(data_bytes <= ip->i_df.if_bytes); - switch (ip->i_d.di_format) { - case XFS_DINODE_FMT_EXTENTS: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DDATA | XFS_ILOG_DBROOT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DEXT) { - ASSERT(ip->i_df.if_bytes > 0); - ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_extents_buf == NULL); - nrecs = ip->i_df.if_bytes / - (uint)sizeof(xfs_bmbt_rec_t); - ASSERT(nrecs > 0); -#ifdef XFS_NATIVE_HOST - if (nrecs == ip->i_d.di_nextents) { - /* - * There are no delayed allocation - * extents, so just point to the - * real extents array. - */ - vecp->i_addr = - (char *)(ip->i_df.if_u1.if_extents); - vecp->i_len = ip->i_df.if_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); - } else -#endif - { - /* - * There are delayed allocation extents - * in the inode, or we need to convert - * the extents to on disk format. - * Use xfs_iextents_copy() - * to copy only the real extents into - * a separate buffer. We'll free the - * buffer in the unlock routine. - */ - ext_buffer = kmem_alloc(ip->i_df.if_bytes, - KM_SLEEP); - iip->ili_extents_buf = ext_buffer; - vecp->i_addr = (xfs_caddr_t)ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, - XFS_DATA_FORK); - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT); - } - ASSERT(vecp->i_len <= ip->i_df.if_bytes); - iip->ili_format.ilf_dsize = vecp->i_len; - vecp++; - nvecs++; + ilf->ilf_dsize = data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_DEXT; } break; - case XFS_DINODE_FMT_BTREE: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DDATA | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DBROOT) { - ASSERT(ip->i_df.if_broot_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + + if ((iip->ili_fields & XFS_ILOG_DBROOT) && + ip->i_df.if_broot_bytes > 0) { ASSERT(ip->i_df.if_broot != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot; - vecp->i_len = ip->i_df.if_broot_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT); - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes; + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT, + ip->i_df.if_broot, + ip->i_df.if_broot_bytes); + ilf->ilf_dsize = ip->i_df.if_broot_bytes; + ilf->ilf_size++; + } else { + ASSERT(!(iip->ili_fields & + XFS_ILOG_DBROOT)); + iip->ili_fields &= ~XFS_ILOG_DBROOT; } break; - case XFS_DINODE_FMT_LOCAL: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DEV | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DDATA) { - ASSERT(ip->i_df.if_bytes > 0); - ASSERT(ip->i_df.if_u1.if_data != NULL); - ASSERT(ip->i_d.di_size > 0); - - vecp->i_addr = (xfs_caddr_t)ip->i_df.if_u1.if_data; + iip->ili_fields &= + ~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | + XFS_ILOG_DEV | XFS_ILOG_UUID); + if ((iip->ili_fields & XFS_ILOG_DDATA) && + ip->i_df.if_bytes > 0) { /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_df.if_bytes, 4); - ASSERT((ip->i_df.if_real_bytes == 0) || - (ip->i_df.if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL); - vecp++; - nvecs++; - iip->ili_format.ilf_dsize = (unsigned)data_bytes; + ASSERT(ip->i_df.if_real_bytes == 0 || + ip->i_df.if_real_bytes == data_bytes); + ASSERT(ip->i_df.if_u1.if_data != NULL); + ASSERT(ip->i_d.di_size > 0); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL, + ip->i_df.if_u1.if_data, data_bytes); + ilf->ilf_dsize = (unsigned)data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_DDATA; } break; - case XFS_DINODE_FMT_DEV: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DDATA | XFS_ILOG_UUID))); - if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) { - iip->ili_format.ilf_u.ilfu_rdev = - ip->i_df.if_u2.if_rdev; - } + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_UUID); + if (iip->ili_fields & XFS_ILOG_DEV) + ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev; break; - case XFS_DINODE_FMT_UUID: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_DBROOT | XFS_ILOG_DEXT | - XFS_ILOG_DDATA | XFS_ILOG_DEV))); - if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) { - iip->ili_format.ilf_u.ilfu_uuid = - ip->i_df.if_u2.if_uuid; - } + iip->ili_fields &= + ~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | + XFS_ILOG_DEXT | XFS_ILOG_DEV); + if (iip->ili_fields & XFS_ILOG_UUID) + ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid; break; - default: ASSERT(0); break; } +} - /* - * If there are no attributes associated with the file, - * then we're done. - * Assert that no attribute-related log flags are set. - */ - if (!XFS_IFORK_Q(ip)) { - ASSERT(nvecs == iip->ili_item.li_desc->lid_size); - iip->ili_format.ilf_size = nvecs; - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); - return; - } +STATIC void +xfs_inode_item_format_attr_fork( + struct xfs_inode_log_item *iip, + struct xfs_inode_log_format *ilf, + struct xfs_log_vec *lv, + struct xfs_log_iovec **vecp) +{ + struct xfs_inode *ip = iip->ili_inode; + size_t data_bytes; switch (ip->i_d.di_aformat) { case XFS_DINODE_FMT_EXTENTS: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_ABROOT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_AEXT) { - ASSERT(ip->i_afp->if_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT); + + if ((iip->ili_fields & XFS_ILOG_AEXT) && + ip->i_d.di_anextents > 0 && + ip->i_afp->if_bytes > 0) { + struct xfs_bmbt_rec *p; + + ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); - ASSERT(ip->i_d.di_anextents > 0); -#ifdef DEBUG - nrecs = ip->i_afp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t); -#endif - ASSERT(nrecs > 0); - ASSERT(nrecs == ip->i_d.di_anextents); -#ifdef XFS_NATIVE_HOST - /* - * There are not delayed allocation extents - * for attributes, so just point at the array. - */ - vecp->i_addr = (char *)(ip->i_afp->if_u1.if_extents); - vecp->i_len = ip->i_afp->if_bytes; -#else - ASSERT(iip->ili_aextents_buf == NULL); - /* - * Need to endian flip before logging - */ - ext_buffer = kmem_alloc(ip->i_afp->if_bytes, - KM_SLEEP); - iip->ili_aextents_buf = ext_buffer; - vecp->i_addr = (xfs_caddr_t)ext_buffer; - vecp->i_len = xfs_iextents_copy(ip, ext_buffer, - XFS_ATTR_FORK); -#endif - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT); - iip->ili_format.ilf_asize = vecp->i_len; - vecp++; - nvecs++; + + p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT); + data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK); + xlog_finish_iovec(lv, *vecp, data_bytes); + + ilf->ilf_asize = data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_AEXT; } break; - case XFS_DINODE_FMT_BTREE: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ADATA | XFS_ILOG_AEXT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_ABROOT) { - ASSERT(ip->i_afp->if_broot_bytes > 0); + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_AEXT); + + if ((iip->ili_fields & XFS_ILOG_ABROOT) && + ip->i_afp->if_broot_bytes > 0) { ASSERT(ip->i_afp->if_broot != NULL); - vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot; - vecp->i_len = ip->i_afp->if_broot_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT); - vecp++; - nvecs++; - iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes; + + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT, + ip->i_afp->if_broot, + ip->i_afp->if_broot_bytes); + ilf->ilf_asize = ip->i_afp->if_broot_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_ABROOT; } break; - case XFS_DINODE_FMT_LOCAL: - ASSERT(!(iip->ili_format.ilf_fields & - (XFS_ILOG_ABROOT | XFS_ILOG_AEXT))); - if (iip->ili_format.ilf_fields & XFS_ILOG_ADATA) { - ASSERT(ip->i_afp->if_bytes > 0); - ASSERT(ip->i_afp->if_u1.if_data != NULL); + iip->ili_fields &= + ~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT); - vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_u1.if_data; + if ((iip->ili_fields & XFS_ILOG_ADATA) && + ip->i_afp->if_bytes > 0) { /* * Round i_bytes up to a word boundary. * The underlying memory is guaranteed to * to be there by xfs_idata_realloc(). */ data_bytes = roundup(ip->i_afp->if_bytes, 4); - ASSERT((ip->i_afp->if_real_bytes == 0) || - (ip->i_afp->if_real_bytes == data_bytes)); - vecp->i_len = (int)data_bytes; - XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL); - vecp++; - nvecs++; - iip->ili_format.ilf_asize = (unsigned)data_bytes; + ASSERT(ip->i_afp->if_real_bytes == 0 || + ip->i_afp->if_real_bytes == data_bytes); + ASSERT(ip->i_afp->if_u1.if_data != NULL); + xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL, + ip->i_afp->if_u1.if_data, + data_bytes); + ilf->ilf_asize = (unsigned)data_bytes; + ilf->ilf_size++; + } else { + iip->ili_fields &= ~XFS_ILOG_ADATA; } break; - default: ASSERT(0); break; } - - ASSERT(nvecs == iip->ili_item.li_desc->lid_size); - iip->ili_format.ilf_size = nvecs; } +/* + * This is called to fill in the vector of log iovecs for the given inode + * log item. It fills the first item with an inode log format structure, + * the second with the on-disk inode structure, and a possible third and/or + * fourth with the inode data/extents/b-tree root and inode attributes + * data/extents/b-tree root. + */ +STATIC void +xfs_inode_item_format( + struct xfs_log_item *lip, + struct xfs_log_vec *lv) +{ + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_inode_log_format *ilf; + struct xfs_log_iovec *vecp = NULL; + + ASSERT(ip->i_d.di_version > 1); + + ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT); + ilf->ilf_type = XFS_LI_INODE; + ilf->ilf_ino = ip->i_ino; + ilf->ilf_blkno = ip->i_imap.im_blkno; + ilf->ilf_len = ip->i_imap.im_len; + ilf->ilf_boffset = ip->i_imap.im_boffset; + ilf->ilf_fields = XFS_ILOG_CORE; + ilf->ilf_size = 2; /* format + core */ + xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format)); + + xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE, + &ip->i_d, + xfs_icdinode_size(ip->i_d.di_version)); + + xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp); + if (XFS_IFORK_Q(ip)) { + xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp); + } else { + iip->ili_fields &= + ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT); + } + + /* update the format with the exact fields we actually logged */ + ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP); +} /* * This is called to pin the inode associated with the inode log - * item in memory so it cannot be written out. Do this by calling - * xfs_ipin() to bump the pin count in the inode while holding the - * inode pin lock. + * item in memory so it cannot be written out. */ STATIC void xfs_inode_item_pin( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); - xfs_ipin(iip->ili_inode); + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + trace_xfs_inode_pin(ip, _RET_IP_); + atomic_inc(&ip->i_pincount); } /* * This is called to unpin the inode associated with the inode log * item which was previously pinned with a call to xfs_inode_item_pin(). - * Just call xfs_iunpin() on the inode to do this. + * + * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0. */ -/* ARGSUSED */ STATIC void xfs_inode_item_unpin( - xfs_inode_log_item_t *iip, - int stale) + struct xfs_log_item *lip, + int remove) { - xfs_iunpin(iip->ili_inode); -} + struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode; -/* ARGSUSED */ -STATIC void -xfs_inode_item_unpin_remove( - xfs_inode_log_item_t *iip, - xfs_trans_t *tp) -{ - xfs_iunpin(iip->ili_inode); + trace_xfs_inode_unpin(ip, _RET_IP_); + ASSERT(atomic_read(&ip->i_pincount) > 0); + if (atomic_dec_and_test(&ip->i_pincount)) + wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } -/* - * This is called to attempt to lock the inode associated with this - * inode log item, in preparation for the push routine which does the actual - * iflush. Don't sleep on the inode lock or the flush lock. - * - * If the flush lock is already held, indicating that the inode has - * been or is in the process of being flushed, then (ideally) we'd like to - * see if the inode's buffer is still incore, and if so give it a nudge. - * We delay doing so until the pushbuf routine, though, to avoid holding - * the AIL lock across a call to the blackhole which is the buffer cache. - * Also we don't want to sleep in any device strategy routines, which can happen - * if we do the subsequent bawrite in here. - */ STATIC uint -xfs_inode_item_trylock( - xfs_inode_log_item_t *iip) +xfs_inode_item_push( + struct xfs_log_item *lip, + struct list_head *buffer_list) { - register xfs_inode_t *ip; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + struct xfs_buf *bp = NULL; + uint rval = XFS_ITEM_SUCCESS; + int error; - ip = iip->ili_inode; + if (xfs_ipincount(ip) > 0) + return XFS_ITEM_PINNED; + + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) + return XFS_ITEM_LOCKED; + /* + * Re-check the pincount now that we stabilized the value by + * taking the ilock. + */ if (xfs_ipincount(ip) > 0) { - return XFS_ITEM_PINNED; + rval = XFS_ITEM_PINNED; + goto out_unlock; } - if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - return XFS_ITEM_LOCKED; + /* + * Stale inode items should force out the iclog. + */ + if (ip->i_flags & XFS_ISTALE) { + rval = XFS_ITEM_PINNED; + goto out_unlock; } + /* + * Someone else is already flushing the inode. Nothing we can do + * here but wait for the flush to finish and remove the item from + * the AIL. + */ if (!xfs_iflock_nowait(ip)) { - /* - * If someone else isn't already trying to push the inode - * buffer, we get to do it. - */ - if (iip->ili_pushbuf_flag == 0) { - iip->ili_pushbuf_flag = 1; -#ifdef DEBUG - iip->ili_push_owner = current_pid(); -#endif - /* - * Inode is left locked in shared mode. - * Pushbuf routine gets to unlock it. - */ - return XFS_ITEM_PUSHBUF; - } else { - /* - * We hold the AIL_LOCK, so we must specify the - * NONOTIFY flag so that we won't double trip. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); - return XFS_ITEM_FLUSHING; - } - /* NOTREACHED */ + rval = XFS_ITEM_FLUSHING; + goto out_unlock; } - /* Stale items should force out the iclog */ - if (ip->i_flags & XFS_ISTALE) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); - return XFS_ITEM_PINNED; - } + ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount)); + + spin_unlock(&lip->li_ailp->xa_lock); -#ifdef DEBUG - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - ASSERT(iip->ili_format.ilf_fields != 0); - ASSERT(iip->ili_logged == 0); - ASSERT(iip->ili_item.li_flags & XFS_LI_IN_AIL); + error = xfs_iflush(ip, &bp); + if (!error) { + if (!xfs_buf_delwri_queue(bp, buffer_list)) + rval = XFS_ITEM_FLUSHING; + xfs_buf_relse(bp); } -#endif - return XFS_ITEM_SUCCESS; + + spin_lock(&lip->li_ailp->xa_lock); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return rval; } /* @@ -649,276 +476,82 @@ xfs_inode_item_trylock( */ STATIC void xfs_inode_item_unlock( - xfs_inode_log_item_t *iip) + struct xfs_log_item *lip) { - uint hold; - uint iolocked; - uint lock_flags; - xfs_inode_t *ip; - - ASSERT(iip != NULL); - ASSERT(iip->ili_inode->i_itemp != NULL); - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); - ASSERT((!(iip->ili_inode->i_itemp->ili_flags & - XFS_ILI_IOLOCKED_EXCL)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE)); - ASSERT((!(iip->ili_inode->i_itemp->ili_flags & - XFS_ILI_IOLOCKED_SHARED)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS)); - /* - * Clear the transaction pointer in the inode. - */ - ip = iip->ili_inode; - ip->i_transp = NULL; + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; + unsigned short lock_flags; - /* - * If the inode needed a separate buffer with which to log - * its extents, then free it now. - */ - if (iip->ili_extents_buf != NULL) { - ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_nextents > 0); - ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT); - ASSERT(ip->i_df.if_bytes > 0); - kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes); - iip->ili_extents_buf = NULL; - } - if (iip->ili_aextents_buf != NULL) { - ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS); - ASSERT(ip->i_d.di_anextents > 0); - ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT); - ASSERT(ip->i_afp->if_bytes > 0); - kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes); - iip->ili_aextents_buf = NULL; - } + ASSERT(ip->i_itemp != NULL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* - * Figure out if we should unlock the inode or not. - */ - hold = iip->ili_flags & XFS_ILI_HOLD; - - /* - * Before clearing out the flags, remember whether we - * are holding the inode's IO lock. - */ - iolocked = iip->ili_flags & XFS_ILI_IOLOCKED_ANY; - - /* - * Clear out the fields of the inode log item particular - * to the current transaction. - */ - iip->ili_ilock_recur = 0; - iip->ili_iolock_recur = 0; - iip->ili_flags = 0; - - /* - * Unlock the inode if XFS_ILI_HOLD was not set. - */ - if (!hold) { - lock_flags = XFS_ILOCK_EXCL; - if (iolocked & XFS_ILI_IOLOCKED_EXCL) { - lock_flags |= XFS_IOLOCK_EXCL; - } else if (iolocked & XFS_ILI_IOLOCKED_SHARED) { - lock_flags |= XFS_IOLOCK_SHARED; - } - xfs_iput(iip->ili_inode, lock_flags); - } + lock_flags = iip->ili_lock_flags; + iip->ili_lock_flags = 0; + if (lock_flags) + xfs_iunlock(ip, lock_flags); } /* - * This is called to find out where the oldest active copy of the - * inode log item in the on disk log resides now that the last log - * write of it completed at the given lsn. Since we always re-log - * all dirty data in an inode, the latest copy in the on disk log - * is the only one that matters. Therefore, simply return the - * given lsn. + * This is called to find out where the oldest active copy of the inode log + * item in the on disk log resides now that the last log write of it completed + * at the given lsn. Since we always re-log all dirty data in an inode, the + * latest copy in the on disk log is the only one that matters. Therefore, + * simply return the given lsn. + * + * If the inode has been marked stale because the cluster is being freed, we + * don't want to (re-)insert this inode into the AIL. There is a race condition + * where the cluster buffer may be unpinned before the inode is inserted into + * the AIL during transaction committed processing. If the buffer is unpinned + * before the inode item has been committed and inserted, then it is possible + * for the buffer to be written and IO completes before the inode is inserted + * into the AIL. In that case, we'd be inserting a clean, stale inode into the + * AIL which will never get removed. It will, however, get reclaimed which + * triggers an assert in xfs_inode_free() complaining about freein an inode + * still in the AIL. + * + * To avoid this, just unpin the inode directly and return a LSN of -1 so the + * transaction committed code knows that it does not need to do any further + * processing on the item. */ -/*ARGSUSED*/ STATIC xfs_lsn_t xfs_inode_item_committed( - xfs_inode_log_item_t *iip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - return (lsn); -} + struct xfs_inode_log_item *iip = INODE_ITEM(lip); + struct xfs_inode *ip = iip->ili_inode; -/* - * The transaction with the inode locked has aborted. The inode - * must not be dirty within the transaction (unless we're forcibly - * shutting down). We simply unlock just as if the transaction - * had been cancelled. - */ -STATIC void -xfs_inode_item_abort( - xfs_inode_log_item_t *iip) -{ - xfs_inode_item_unlock(iip); - return; -} - - -/* - * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK - * failed to get the inode flush lock but did get the inode locked SHARED. - * Here we're trying to see if the inode buffer is incore, and if so whether it's - * marked delayed write. If that's the case, we'll initiate a bawrite on that - * buffer to expedite the process. - * - * We aren't holding the AIL_LOCK (or the flush lock) when this gets called, - * so it is inherently race-y. - */ -STATIC void -xfs_inode_item_pushbuf( - xfs_inode_log_item_t *iip) -{ - xfs_inode_t *ip; - xfs_mount_t *mp; - xfs_buf_t *bp; - uint dopush; - - ip = iip->ili_inode; - - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); - - /* - * The ili_pushbuf_flag keeps others from - * trying to duplicate our effort. - */ - ASSERT(iip->ili_pushbuf_flag != 0); - ASSERT(iip->ili_push_owner == current_pid()); - - /* - * If flushlock isn't locked anymore, chances are that the - * inode flush completed and the inode was taken off the AIL. - * So, just get out. - */ - if (!issemalocked(&(ip->i_flock)) || - ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return; + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_inode_item_unpin(lip, 0); + return -1; } - - mp = ip->i_mount; - bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, - iip->ili_format.ilf_len, XFS_INCORE_TRYLOCK); - - if (bp != NULL) { - if (XFS_BUF_ISDELAYWRITE(bp)) { - /* - * We were racing with iflush because we don't hold - * the AIL_LOCK or the flush lock. However, at this point, - * we have the buffer, and we know that it's dirty. - * So, it's possible that iflush raced with us, and - * this item is already taken off the AIL. - * If not, we can flush it async. - */ - dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) && - issemalocked(&(ip->i_flock))); - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_buftrace("INODE ITEM PUSH", bp); - if (XFS_BUF_ISPINNED(bp)) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE); - } - if (dopush) { - xfs_bawrite(mp, bp); - } else { - xfs_buf_relse(bp); - } - } else { - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - xfs_buf_relse(bp); - } - return; - } - /* - * We have to be careful about resetting pushbuf flag too early (above). - * Even though in theory we can do it as soon as we have the buflock, - * we don't want others to be doing work needlessly. They'll come to - * this function thinking that pushing the buffer is their - * responsibility only to find that the buffer is still locked by - * another doing the same thing - */ - iip->ili_pushbuf_flag = 0; - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return; -} - - -/* - * This is called to asynchronously write the inode associated with this - * inode log item out to disk. The inode will already have been locked by - * a successful call to xfs_inode_item_trylock(). - */ -STATIC void -xfs_inode_item_push( - xfs_inode_log_item_t *iip) -{ - xfs_inode_t *ip; - - ip = iip->ili_inode; - - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); - ASSERT(issemalocked(&(ip->i_flock))); - /* - * Since we were able to lock the inode's flush lock and - * we found it on the AIL, the inode must be dirty. This - * is because the inode is removed from the AIL while still - * holding the flush lock in xfs_iflush_done(). Thus, if - * we found it in the AIL and were able to obtain the flush - * lock without sleeping, then there must not have been - * anyone in the process of flushing the inode. - */ - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || - iip->ili_format.ilf_fields != 0); - - /* - * Write out the inode. The completion routine ('iflush_done') will - * pull it from the AIL, mark it clean, unlock the flush lock. - */ - (void) xfs_iflush(ip, XFS_IFLUSH_ASYNC); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - return; + return lsn; } /* * XXX rcc - this one really has to do something. Probably needs * to stamp in a new field in the incore inode. */ -/* ARGSUSED */ STATIC void xfs_inode_item_committing( - xfs_inode_log_item_t *iip, + struct xfs_log_item *lip, xfs_lsn_t lsn) { - iip->ili_last_lsn = lsn; - return; + INODE_ITEM(lip)->ili_last_lsn = lsn; } /* * This is the ops vector shared by all buf log items. */ -STATIC struct xfs_item_ops xfs_inode_item_ops = { - .iop_size = (uint(*)(xfs_log_item_t*))xfs_inode_item_size, - .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) - xfs_inode_item_format, - .iop_pin = (void(*)(xfs_log_item_t*))xfs_inode_item_pin, - .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_inode_item_unpin, - .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t*)) - xfs_inode_item_unpin_remove, - .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_inode_item_trylock, - .iop_unlock = (void(*)(xfs_log_item_t*))xfs_inode_item_unlock, - .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_inode_item_committed, - .iop_push = (void(*)(xfs_log_item_t*))xfs_inode_item_push, - .iop_abort = (void(*)(xfs_log_item_t*))xfs_inode_item_abort, - .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_inode_item_pushbuf, - .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) - xfs_inode_item_committing +static const struct xfs_item_ops xfs_inode_item_ops = { + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, + .iop_unpin = xfs_inode_item_unpin, + .iop_unlock = xfs_inode_item_unlock, + .iop_committed = xfs_inode_item_committed, + .iop_push = xfs_inode_item_push, + .iop_committing = xfs_inode_item_committing }; @@ -927,30 +560,17 @@ STATIC struct xfs_item_ops xfs_inode_item_ops = { */ void xfs_inode_item_init( - xfs_inode_t *ip, - xfs_mount_t *mp) + struct xfs_inode *ip, + struct xfs_mount *mp) { - xfs_inode_log_item_t *iip; + struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); iip = ip->i_itemp = kmem_zone_zalloc(xfs_ili_zone, KM_SLEEP); - iip->ili_item.li_type = XFS_LI_INODE; - iip->ili_item.li_ops = &xfs_inode_item_ops; - iip->ili_item.li_mountp = mp; iip->ili_inode = ip; - - /* - We have zeroed memory. No need ... - iip->ili_extents_buf = NULL; - iip->ili_pushbuf_flag = 0; - */ - - iip->ili_format.ilf_type = XFS_LI_INODE; - iip->ili_format.ilf_ino = ip->i_ino; - iip->ili_format.ilf_blkno = ip->i_blkno; - iip->ili_format.ilf_len = ip->i_len; - iip->ili_format.ilf_boffset = ip->i_boffset; + xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE, + &xfs_inode_item_ops); } /* @@ -960,12 +580,6 @@ void xfs_inode_item_destroy( xfs_inode_t *ip) { -#ifdef XFS_TRANS_DEBUG - if (ip->i_itemp->ili_root_size != 0) { - kmem_free(ip->i_itemp->ili_orig_root, - ip->i_itemp->ili_root_size); - } -#endif kmem_zone_free(xfs_ili_zone, ip->i_itemp); } @@ -976,17 +590,64 @@ xfs_inode_item_destroy( * flushed to disk. It is responsible for removing the inode item * from the AIL if it has not been re-logged, and unlocking the inode's * flush lock. + * + * To reduce AIL lock traffic as much as possible, we scan the buffer log item + * list for other inodes that will run this function. We remove them from the + * buffer list so we can process all the inode IO completions in one AIL lock + * traversal. */ -/*ARGSUSED*/ void xfs_iflush_done( - xfs_buf_t *bp, - xfs_inode_log_item_t *iip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - xfs_inode_t *ip; - SPLDECL(s); + struct xfs_inode_log_item *iip; + struct xfs_log_item *blip; + struct xfs_log_item *next; + struct xfs_log_item *prev; + struct xfs_ail *ailp = lip->li_ailp; + int need_ail = 0; + + /* + * Scan the buffer IO completions for other inodes being completed and + * attach them to the current inode log item. + */ + blip = bp->b_fspriv; + prev = NULL; + while (blip != NULL) { + if (lip->li_cb != xfs_iflush_done) { + prev = blip; + blip = blip->li_bio_list; + continue; + } + + /* remove from list */ + next = blip->li_bio_list; + if (!prev) { + bp->b_fspriv = next; + } else { + prev->li_bio_list = next; + } + + /* add to current list */ + blip->li_bio_list = lip->li_bio_list; + lip->li_bio_list = blip; - ip = iip->ili_inode; + /* + * while we have the item, do the unlocked check for needing + * the AIL lock. + */ + iip = INODE_ITEM(blip); + if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + need_ail++; + + blip = next; + } + + /* make sure we capture the state of the initial inode. */ + iip = INODE_ITEM(lip); + if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + need_ail++; /* * We only want to pull the item from the AIL if it is @@ -997,65 +658,65 @@ xfs_iflush_done( * the lock since it's cheaper, and then we recheck while * holding the lock before removing the inode from the AIL. */ - if (iip->ili_logged && - (iip->ili_item.li_lsn == iip->ili_flush_lsn)) { - AIL_LOCK(ip->i_mount, s); - if (iip->ili_item.li_lsn == iip->ili_flush_lsn) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(ip->i_mount, - (xfs_log_item_t*)iip, s); - } else { - AIL_UNLOCK(ip->i_mount, s); + if (need_ail) { + struct xfs_log_item *log_items[need_ail]; + int i = 0; + spin_lock(&ailp->xa_lock); + for (blip = lip; blip; blip = blip->li_bio_list) { + iip = INODE_ITEM(blip); + if (iip->ili_logged && + blip->li_lsn == iip->ili_flush_lsn) { + log_items[i++] = blip; + } + ASSERT(i <= need_ail); } + /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ + xfs_trans_ail_delete_bulk(ailp, log_items, i, + SHUTDOWN_CORRUPT_INCORE); } - iip->ili_logged = 0; /* - * Clear the ili_last_fields bits now that we know that the - * data corresponding to them is safely on disk. + * clean up and unlock the flush lock now we are done. We can clear the + * ili_last_fields bits now that we know that the data corresponding to + * them is safely on disk. */ - iip->ili_last_fields = 0; + for (blip = lip; blip; blip = next) { + next = blip->li_bio_list; + blip->li_bio_list = NULL; - /* - * Release the inode's flush lock since we're done with it. - */ - xfs_ifunlock(ip); - - return; + iip = INODE_ITEM(blip); + iip->ili_logged = 0; + iip->ili_last_fields = 0; + xfs_ifunlock(iip->ili_inode); + } } /* - * This is the inode flushing abort routine. It is called - * from xfs_iflush when the filesystem is shutting down to clean - * up the inode state. - * It is responsible for removing the inode item - * from the AIL if it has not been re-logged, and unlocking the inode's - * flush lock. + * This is the inode flushing abort routine. It is called from xfs_iflush when + * the filesystem is shutting down to clean up the inode state. It is + * responsible for removing the inode item from the AIL if it has not been + * re-logged, and unlocking the inode's flush lock. */ void xfs_iflush_abort( - xfs_inode_t *ip) + xfs_inode_t *ip, + bool stale) { - xfs_inode_log_item_t *iip; - xfs_mount_t *mp; - SPLDECL(s); + xfs_inode_log_item_t *iip = ip->i_itemp; - iip = ip->i_itemp; - mp = ip->i_mount; if (iip) { + struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - AIL_LOCK(mp, s); + spin_lock(&ailp->xa_lock); if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { - /* - * xfs_trans_delete_ail() drops the AIL lock. - */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip, - s); + /* xfs_trans_ail_delete() drops the AIL lock. */ + xfs_trans_ail_delete(ailp, &iip->ili_item, + stale ? + SHUTDOWN_LOG_IO_ERROR : + SHUTDOWN_CORRUPT_INCORE); } else - AIL_UNLOCK(mp, s); + spin_unlock(&ailp->xa_lock); } iip->ili_logged = 0; /* @@ -1067,7 +728,7 @@ xfs_iflush_abort( * Clear the inode logging fields so no more flushes are * attempted. */ - iip->ili_format.ilf_fields = 0; + iip->ili_fields = 0; } /* * Release the inode's flush lock since we're done with it. @@ -1077,10 +738,10 @@ xfs_iflush_abort( void xfs_istale_done( - xfs_buf_t *bp, - xfs_inode_log_item_t *iip) + struct xfs_buf *bp, + struct xfs_log_item *lip) { - xfs_iflush_abort(iip->ili_inode); + xfs_iflush_abort(INODE_ITEM(lip)->ili_inode, true); } /* @@ -1093,9 +754,8 @@ xfs_inode_item_format_convert( xfs_inode_log_format_t *in_f) { if (buf->i_len == sizeof(xfs_inode_log_format_32_t)) { - xfs_inode_log_format_32_t *in_f32; + xfs_inode_log_format_32_t *in_f32 = buf->i_addr; - in_f32 = (xfs_inode_log_format_32_t *)buf->i_addr; in_f->ilf_type = in_f32->ilf_type; in_f->ilf_size = in_f32->ilf_size; in_f->ilf_fields = in_f32->ilf_fields; @@ -1111,9 +771,8 @@ xfs_inode_item_format_convert( in_f->ilf_boffset = in_f32->ilf_boffset; return 0; } else if (buf->i_len == sizeof(xfs_inode_log_format_64_t)){ - xfs_inode_log_format_64_t *in_f64; + xfs_inode_log_format_64_t *in_f64 = buf->i_addr; - in_f64 = (xfs_inode_log_format_64_t *)buf->i_addr; in_f->ilf_type = in_f64->ilf_type; in_f->ilf_size = in_f64->ilf_size; in_f->ilf_fields = in_f64->ilf_fields; diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 5db6cd1b4cf..488d81254e2 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -18,164 +18,37 @@ #ifndef __XFS_INODE_ITEM_H__ #define __XFS_INODE_ITEM_H__ -/* - * This is the structure used to lay out an inode log item in the - * log. The size of the inline data/extents/b-tree root to be logged - * (if any) is indicated in the ilf_dsize field. Changes to this structure - * must be added on to the end. - */ -typedef struct xfs_inode_log_format { - unsigned short ilf_type; /* inode log item type */ - unsigned short ilf_size; /* size of this item */ - uint ilf_fields; /* flags for fields logged */ - ushort ilf_asize; /* size of attr d/ext/root */ - ushort ilf_dsize; /* size of data/ext/root */ - xfs_ino_t ilf_ino; /* inode number */ - union { - xfs_dev_t ilfu_rdev; /* rdev value for dev inode*/ - uuid_t ilfu_uuid; /* mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* blkno of inode buffer */ - int ilf_len; /* len of inode buffer */ - int ilf_boffset; /* off of inode in buffer */ -} xfs_inode_log_format_t; - -typedef struct xfs_inode_log_format_32 { - unsigned short ilf_type; /* 16: inode log item type */ - unsigned short ilf_size; /* 16: size of this item */ - uint ilf_fields; /* 32: flags for fields logged */ - ushort ilf_asize; /* 32: size of attr d/ext/root */ - ushort ilf_dsize; /* 32: size of data/ext/root */ - xfs_ino_t ilf_ino; /* 64: inode number */ - union { - xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ - uuid_t ilfu_uuid; /* 128: mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* 64: blkno of inode buffer */ - int ilf_len; /* 32: len of inode buffer */ - int ilf_boffset; /* 32: off of inode in buffer */ -} __attribute__((packed)) xfs_inode_log_format_32_t; - -typedef struct xfs_inode_log_format_64 { - unsigned short ilf_type; /* 16: inode log item type */ - unsigned short ilf_size; /* 16: size of this item */ - uint ilf_fields; /* 32: flags for fields logged */ - ushort ilf_asize; /* 32: size of attr d/ext/root */ - ushort ilf_dsize; /* 32: size of data/ext/root */ - __uint32_t ilf_pad; /* 32: pad for 64 bit boundary */ - xfs_ino_t ilf_ino; /* 64: inode number */ - union { - xfs_dev_t ilfu_rdev; /* 32: rdev value for dev inode*/ - uuid_t ilfu_uuid; /* 128: mount point value */ - } ilf_u; - __int64_t ilf_blkno; /* 64: blkno of inode buffer */ - int ilf_len; /* 32: len of inode buffer */ - int ilf_boffset; /* 32: off of inode in buffer */ -} xfs_inode_log_format_64_t; - -/* - * Flags for xfs_trans_log_inode flags field. - */ -#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ -#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ -#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ -#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ -#define XFS_ILOG_DEV 0x010 /* log the dev field */ -#define XFS_ILOG_UUID 0x020 /* log the uuid field */ -#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ -#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ -#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ - -#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ - XFS_ILOG_UUID | XFS_ILOG_ADATA | \ - XFS_ILOG_AEXT | XFS_ILOG_ABROOT) - -#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ - XFS_ILOG_DBROOT) - -#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT) - -#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ - XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ - XFS_ILOG_DEV | XFS_ILOG_UUID | \ - XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ - XFS_ILOG_ABROOT) - -#define XFS_ILI_HOLD 0x1 -#define XFS_ILI_IOLOCKED_EXCL 0x2 -#define XFS_ILI_IOLOCKED_SHARED 0x4 - -#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED) - - -#ifdef __KERNEL__ +/* kernel only definitions */ struct xfs_buf; -struct xfs_bmbt_rec_64; +struct xfs_bmbt_rec; struct xfs_inode; struct xfs_mount; - typedef struct xfs_inode_log_item { xfs_log_item_t ili_item; /* common portion */ struct xfs_inode *ili_inode; /* inode ptr */ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ - unsigned short ili_ilock_recur; /* lock recursion count */ - unsigned short ili_iolock_recur; /* lock recursion count */ - unsigned short ili_flags; /* misc flags */ + unsigned short ili_lock_flags; /* lock flags */ unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ - struct xfs_bmbt_rec_64 *ili_extents_buf; /* array of logged - data exts */ - struct xfs_bmbt_rec_64 *ili_aextents_buf; /* array of logged - attr exts */ - unsigned int ili_pushbuf_flag; /* one bit used in push_ail */ - -#ifdef DEBUG - uint64_t ili_push_owner; /* one who sets pushbuf_flag - above gets to push the buf */ -#endif -#ifdef XFS_TRANS_DEBUG - int ili_root_size; - char *ili_orig_root; -#endif - xfs_inode_log_format_t ili_format; /* logged structure */ + unsigned int ili_fields; /* fields to be logged */ } xfs_inode_log_item_t; - -#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w) -static inline int xfs_ilog_fdata(int w) +static inline int xfs_inode_clean(xfs_inode_t *ip) { - return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); + return !ip->i_itemp || !(ip->i_itemp->ili_fields & XFS_ILOG_ALL); } -#endif /* __KERNEL__ */ - -#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w) -static inline int xfs_ilog_fbroot(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); -} - -#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w) -static inline int xfs_ilog_fext(int w) -{ - return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); -} - -#ifdef __KERNEL__ - extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); -extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); -extern void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *); -extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_istale_done(struct xfs_buf *, struct xfs_log_item *); +extern void xfs_iflush_abort(struct xfs_inode *, bool); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, xfs_inode_log_format_t *); -#endif /* __KERNEL__ */ +extern struct kmem_zone *xfs_ili_zone; #endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index 7a28191cb0d..90efdaf1706 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h @@ -26,22 +26,6 @@ * high agno_log-agblklog-inopblog bits - 0 */ -typedef __uint32_t xfs_agino_t; /* within allocation grp inode number */ - -/* - * Useful inode bits for this kernel. - * Used in some places where having 64-bits in the 32-bit kernels - * costs too much. - */ -#if XFS_BIG_INUMS -typedef xfs_ino_t xfs_intino_t; -#else -typedef __uint32_t xfs_intino_t; -#endif - -#define NULLFSINO ((xfs_ino_t)-1) -#define NULLAGINO ((xfs_agino_t)-1) - struct xfs_mount; #define XFS_INO_MASK(k) (__uint32_t)((1ULL << (k)) - 1) @@ -72,7 +56,6 @@ struct xfs_mount; #if XFS_BIG_INUMS #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) -#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32)) #else #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) #endif diff --git a/fs/xfs/xfs_iocore.c b/fs/xfs/xfs_iocore.c deleted file mode 100644 index 06d710c9ce4..00000000000 --- a/fs/xfs/xfs_iocore.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dfrag.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_quota.h" -#include "xfs_trans_space.h" -#include "xfs_iomap.h" - - -STATIC xfs_fsize_t -xfs_size_fn( - xfs_inode_t *ip) -{ - return (ip->i_d.di_size); -} - -STATIC int -xfs_ioinit( - struct bhv_vfs *vfsp, - struct xfs_mount_args *mntargs, - int flags) -{ - return xfs_mountfs(vfsp, XFS_VFSTOM(vfsp), flags); -} - -xfs_ioops_t xfs_iocore_xfs = { - .xfs_ioinit = (xfs_ioinit_t) xfs_ioinit, - .xfs_bmapi_func = (xfs_bmapi_t) xfs_bmapi, - .xfs_bunmapi_func = (xfs_bunmapi_t) xfs_bunmapi, - .xfs_bmap_eof_func = (xfs_bmap_eof_t) xfs_bmap_eof, - .xfs_iomap_write_direct = - (xfs_iomap_write_direct_t) xfs_iomap_write_direct, - .xfs_iomap_write_delay = - (xfs_iomap_write_delay_t) xfs_iomap_write_delay, - .xfs_iomap_write_allocate = - (xfs_iomap_write_allocate_t) xfs_iomap_write_allocate, - .xfs_iomap_write_unwritten = - (xfs_iomap_write_unwritten_t) xfs_iomap_write_unwritten, - .xfs_ilock = (xfs_lock_t) xfs_ilock, - .xfs_lck_map_shared = (xfs_lck_map_shared_t) xfs_ilock_map_shared, - .xfs_ilock_demote = (xfs_lock_demote_t) xfs_ilock_demote, - .xfs_ilock_nowait = (xfs_lock_nowait_t) xfs_ilock_nowait, - .xfs_unlock = (xfs_unlk_t) xfs_iunlock, - .xfs_size_func = (xfs_size_t) xfs_size_fn, - .xfs_iodone = (xfs_iodone_t) fs_noerr, - .xfs_swap_extents_func = (xfs_swap_extents_t) xfs_swap_extents, -}; - -void -xfs_iocore_inode_reinit( - xfs_inode_t *ip) -{ - xfs_iocore_t *io = &ip->i_iocore; - - io->io_flags = 0; - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) - io->io_flags |= XFS_IOCORE_RT; - io->io_dmevmask = ip->i_d.di_dmevmask; - io->io_dmstate = ip->i_d.di_dmstate; -} - -void -xfs_iocore_inode_init( - xfs_inode_t *ip) -{ - xfs_iocore_t *io = &ip->i_iocore; - xfs_mount_t *mp = ip->i_mount; - - io->io_mount = mp; -#ifdef DEBUG - io->io_lock = &ip->i_lock; - io->io_iolock = &ip->i_iolock; -#endif - - io->io_obj = (void *)ip; - - xfs_iocore_inode_reinit(ip); -} diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c new file mode 100644 index 00000000000..8bc1bbce745 --- /dev/null +++ b/fs/xfs/xfs_ioctl.c @@ -0,0 +1,1810 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ioctl.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_fsops.h" +#include "xfs_discard.h" +#include "xfs_quota.h" +#include "xfs_export.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_dinode.h" +#include "xfs_trans.h" + +#include <linux/capability.h> +#include <linux/dcache.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/pagemap.h> +#include <linux/slab.h> +#include <linux/exportfs.h> + +/* + * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to + * a file or fs handle. + * + * XFS_IOC_PATH_TO_FSHANDLE + * returns fs handle for a mount point or path within that mount point + * XFS_IOC_FD_TO_HANDLE + * returns full handle for a FD opened in user space + * XFS_IOC_PATH_TO_HANDLE + * returns full handle for a path + */ +int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq) +{ + int hsize; + xfs_handle_t handle; + struct inode *inode; + struct fd f = {NULL}; + struct path path; + int error; + struct xfs_inode *ip; + + if (cmd == XFS_IOC_FD_TO_HANDLE) { + f = fdget(hreq->fd); + if (!f.file) + return -EBADF; + inode = file_inode(f.file); + } else { + error = user_lpath((const char __user *)hreq->path, &path); + if (error) + return error; + inode = path.dentry->d_inode; + } + ip = XFS_I(inode); + + /* + * We can only generate handles for inodes residing on a XFS filesystem, + * and only for regular files, directories or symbolic links. + */ + error = -EINVAL; + if (inode->i_sb->s_magic != XFS_SB_MAGIC) + goto out_put; + + error = -EBADF; + if (!S_ISREG(inode->i_mode) && + !S_ISDIR(inode->i_mode) && + !S_ISLNK(inode->i_mode)) + goto out_put; + + + memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); + + if (cmd == XFS_IOC_PATH_TO_FSHANDLE) { + /* + * This handle only contains an fsid, zero the rest. + */ + memset(&handle.ha_fid, 0, sizeof(handle.ha_fid)); + hsize = sizeof(xfs_fsid_t); + } else { + handle.ha_fid.fid_len = sizeof(xfs_fid_t) - + sizeof(handle.ha_fid.fid_len); + handle.ha_fid.fid_pad = 0; + handle.ha_fid.fid_gen = ip->i_d.di_gen; + handle.ha_fid.fid_ino = ip->i_ino; + + hsize = XFS_HSIZE(handle); + } + + error = -EFAULT; + if (copy_to_user(hreq->ohandle, &handle, hsize) || + copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) + goto out_put; + + error = 0; + + out_put: + if (cmd == XFS_IOC_FD_TO_HANDLE) + fdput(f); + else + path_put(&path); + return error; +} + +/* + * No need to do permission checks on the various pathname components + * as the handle operations are privileged. + */ +STATIC int +xfs_handle_acceptable( + void *context, + struct dentry *dentry) +{ + return 1; +} + +/* + * Convert userspace handle data into a dentry. + */ +struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen) +{ + xfs_handle_t handle; + struct xfs_fid64 fid; + + /* + * Only allow handle opens under a directory. + */ + if (!S_ISDIR(file_inode(parfilp)->i_mode)) + return ERR_PTR(-ENOTDIR); + + if (hlen != sizeof(xfs_handle_t)) + return ERR_PTR(-EINVAL); + if (copy_from_user(&handle, uhandle, hlen)) + return ERR_PTR(-EFAULT); + if (handle.ha_fid.fid_len != + sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len)) + return ERR_PTR(-EINVAL); + + memset(&fid, 0, sizeof(struct fid)); + fid.ino = handle.ha_fid.fid_ino; + fid.gen = handle.ha_fid.fid_gen; + + return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3, + FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG, + xfs_handle_acceptable, NULL); +} + +STATIC struct dentry * +xfs_handlereq_to_dentry( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen); +} + +int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + const struct cred *cred = current_cred(); + int error; + int fd; + int permflag; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + fmode_t fmode; + struct path path; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + inode = dentry->d_inode; + + /* Restrict xfs_open_by_handle to directories & regular files. */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { + error = -XFS_ERROR(EPERM); + goto out_dput; + } + +#if BITS_PER_LONG != 32 + hreq->oflags |= O_LARGEFILE; +#endif + + permflag = hreq->oflags; + fmode = OPEN_FMODE(permflag); + if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && + (fmode & FMODE_WRITE) && IS_APPEND(inode)) { + error = -XFS_ERROR(EPERM); + goto out_dput; + } + + if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { + error = -XFS_ERROR(EACCES); + goto out_dput; + } + + /* Can't write directories. */ + if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { + error = -XFS_ERROR(EISDIR); + goto out_dput; + } + + fd = get_unused_fd_flags(0); + if (fd < 0) { + error = fd; + goto out_dput; + } + + path.mnt = parfilp->f_path.mnt; + path.dentry = dentry; + filp = dentry_open(&path, hreq->oflags, cred); + dput(dentry); + if (IS_ERR(filp)) { + put_unused_fd(fd); + return PTR_ERR(filp); + } + + if (S_ISREG(inode->i_mode)) { + filp->f_flags |= O_NOATIME; + filp->f_mode |= FMODE_NOCMTIME; + } + + fd_install(fd, filp); + return fd; + + out_dput: + dput(dentry); + return error; +} + +int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq) +{ + struct dentry *dentry; + __u32 olen; + void *link; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + dentry = xfs_handlereq_to_dentry(parfilp, hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + /* Restrict this handle operation to symlinks only. */ + if (!S_ISLNK(dentry->d_inode->i_mode)) { + error = -XFS_ERROR(EINVAL); + goto out_dput; + } + + if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { + error = -XFS_ERROR(EFAULT); + goto out_dput; + } + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) { + error = -XFS_ERROR(ENOMEM); + goto out_dput; + } + + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (error) + goto out_kfree; + error = readlink_copy(hreq->ohandle, olen, link); + if (error) + goto out_kfree; + + out_kfree: + kfree(link); + out_dput: + dput(dentry); + return error; +} + +int +xfs_set_dmattrs( + xfs_inode_t *ip, + u_int evmask, + u_int16_t state) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_trans_t *tp; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + ip->i_d.di_dmevmask = evmask; + ip->i_d.di_dmstate = state; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + error = xfs_trans_commit(tp, 0); + + return error; +} + +STATIC int +xfs_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + error = mnt_want_write_file(parfilp); + if (error) + return error; + + dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) { + mnt_drop_write_file(parfilp); + return PTR_ERR(dentry); + } + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + + out: + mnt_drop_write_file(parfilp); + dput(dentry); + return error; +} + +STATIC int +xfs_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error = -ENOMEM; + attrlist_cursor_kern_t *cursor; + xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen)) + error = -EFAULT; + +out_kfree: + kmem_free(kbuf); +out_dput: + dput(dentry); + return error; +} + +int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = EFAULT; + + if (*len > XATTR_SIZE_MAX) + return EINVAL; + kbuf = kmem_zalloc_large(*len, KM_SLEEP); + if (!kbuf) + return ENOMEM; + + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); + if (error) + goto out_kfree; + + if (copy_to_user(ubuf, kbuf, *len)) + error = EFAULT; + +out_kfree: + kmem_free(kbuf); + return error; +} + +int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags) +{ + unsigned char *kbuf; + int error = EFAULT; + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return EPERM; + if (len > XATTR_SIZE_MAX) + return EINVAL; + + kbuf = memdup_user(ubuf, len); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); + + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + + return error; +} + +int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags) +{ + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return EPERM; + return xfs_attr_remove(XFS_I(inode), name, flags); +} + +STATIC int +xfs_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + xfs_attr_multiop_t *ops; + xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = E2BIG; + size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(am_hreq.ops, size); + if (IS_ERR(ops)) { + error = -PTR_ERR(ops); + goto out_dput; + } + + error = ENOMEM; + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + ops[i].am_attrname, MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, &ops[i].am_length, + ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + ops[i].am_attrvalue, ops[i].am_length, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(am_hreq.ops, ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return -error; +} + +int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct iattr iattr; + bool setprealloc = false; + bool clrprealloc = false; + int error; + + /* + * Only allow the sys admin to reserve space unless + * unwritten extents are enabled. + */ + if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && + !capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) + return -XFS_ERROR(EPERM); + + if (!(filp->f_mode & FMODE_WRITE)) + return -XFS_ERROR(EBADF); + + if (!S_ISREG(inode->i_mode)) + return -XFS_ERROR(EINVAL); + + error = mnt_want_write_file(filp); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + switch (bf->l_whence) { + case 0: /*SEEK_SET*/ + break; + case 1: /*SEEK_CUR*/ + bf->l_start += filp->f_pos; + break; + case 2: /*SEEK_END*/ + bf->l_start += XFS_ISIZE(ip); + break; + default: + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + /* + * length of <= 0 for resv/unresv/zero is invalid. length for + * alloc/free is ignored completely and we have no idea what userspace + * might have set it to, so set it to zero to allow range + * checks to pass. + */ + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + if (bf->l_len <= 0) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + break; + default: + bf->l_len = 0; + break; + } + + if (bf->l_start < 0 || + bf->l_start > mp->m_super->s_maxbytes || + bf->l_start + bf->l_len < 0 || + bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { + error = XFS_ERROR(EINVAL); + goto out_unlock; + } + + switch (cmd) { + case XFS_IOC_ZERO_RANGE: + error = xfs_zero_file_space(ip, bf->l_start, bf->l_len); + if (!error) + setprealloc = true; + break; + case XFS_IOC_RESVSP: + case XFS_IOC_RESVSP64: + error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len, + XFS_BMAPI_PREALLOC); + if (!error) + setprealloc = true; + break; + case XFS_IOC_UNRESVSP: + case XFS_IOC_UNRESVSP64: + error = xfs_free_file_space(ip, bf->l_start, bf->l_len); + break; + case XFS_IOC_ALLOCSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP: + case XFS_IOC_FREESP64: + if (bf->l_start > XFS_ISIZE(ip)) { + error = xfs_alloc_file_space(ip, XFS_ISIZE(ip), + bf->l_start - XFS_ISIZE(ip), 0); + if (error) + goto out_unlock; + } + + iattr.ia_valid = ATTR_SIZE; + iattr.ia_size = bf->l_start; + + error = xfs_setattr_size(ip, &iattr); + if (!error) + clrprealloc = true; + break; + default: + ASSERT(0); + error = XFS_ERROR(EINVAL); + } + + if (error) + goto out_unlock; + + tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_unlock; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + if (!(ioflags & IO_INVIS)) { + ip->i_d.di_mode &= ~S_ISUID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + } + + if (setprealloc) + ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; + else if (clrprealloc) + ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + if (filp->f_flags & O_DSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +out_unlock: + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + mnt_drop_write_file(filp); + return -error; +} + +STATIC int +xfs_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + void __user *arg) +{ + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) + return -XFS_ERROR(EFAULT); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS) + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt); + else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) + error = xfs_bulkstat_single(mp, &inlast, + bulkreq.ubuffer, &done); + else /* XFS_IOC_FSBULKSTAT */ + error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, + sizeof(xfs_bstat_t), bulkreq.ubuffer, + &done); + + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_ioc_fsgeometry_v1( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + + /* + * Caller should have passed an argument of type + * xfs_fsop_geom_v1_t. This is a proper subset of the + * xfs_fsop_geom_t that xfs_fs_geometry() fills in. + */ + if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_ioc_fsgeometry( + xfs_mount_t *mp, + void __user *arg) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 4); + if (error) + return -error; + + if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* + * Linux extended inode flags interface. + */ + +STATIC unsigned int +xfs_merge_ioc_xflags( + unsigned int flags, + unsigned int start) +{ + unsigned int xflags = start; + + if (flags & FS_IMMUTABLE_FL) + xflags |= XFS_XFLAG_IMMUTABLE; + else + xflags &= ~XFS_XFLAG_IMMUTABLE; + if (flags & FS_APPEND_FL) + xflags |= XFS_XFLAG_APPEND; + else + xflags &= ~XFS_XFLAG_APPEND; + if (flags & FS_SYNC_FL) + xflags |= XFS_XFLAG_SYNC; + else + xflags &= ~XFS_XFLAG_SYNC; + if (flags & FS_NOATIME_FL) + xflags |= XFS_XFLAG_NOATIME; + else + xflags &= ~XFS_XFLAG_NOATIME; + if (flags & FS_NODUMP_FL) + xflags |= XFS_XFLAG_NODUMP; + else + xflags &= ~XFS_XFLAG_NODUMP; + + return xflags; +} + +STATIC unsigned int +xfs_di2lxflags( + __uint16_t di_flags) +{ + unsigned int flags = 0; + + if (di_flags & XFS_DIFLAG_IMMUTABLE) + flags |= FS_IMMUTABLE_FL; + if (di_flags & XFS_DIFLAG_APPEND) + flags |= FS_APPEND_FL; + if (di_flags & XFS_DIFLAG_SYNC) + flags |= FS_SYNC_FL; + if (di_flags & XFS_DIFLAG_NOATIME) + flags |= FS_NOATIME_FL; + if (di_flags & XFS_DIFLAG_NODUMP) + flags |= FS_NODUMP_FL; + return flags; +} + +STATIC int +xfs_ioc_fsgetxattr( + xfs_inode_t *ip, + int attr, + void __user *arg) +{ + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + fa.fsx_xflags = xfs_ip2xflags(ip); + fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; + fa.fsx_projid = xfs_get_projid(ip); + + if (attr) { + if (ip->i_afp) { + if (ip->i_afp->if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_afp->if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_anextents; + } else + fa.fsx_nextents = 0; + } else { + if (ip->i_df.if_flags & XFS_IFEXTENTS) + fa.fsx_nextents = ip->i_df.if_bytes / + sizeof(xfs_bmbt_rec_t); + else + fa.fsx_nextents = ip->i_d.di_nextents; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + if (copy_to_user(arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +STATIC void +xfs_set_diflags( + struct xfs_inode *ip, + unsigned int xflags) +{ + unsigned int di_flags; + + /* can't set PREALLOC this way, just preserve it */ + di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & XFS_XFLAG_IMMUTABLE) + di_flags |= XFS_DIFLAG_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + di_flags |= XFS_DIFLAG_APPEND; + if (xflags & XFS_XFLAG_SYNC) + di_flags |= XFS_DIFLAG_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + di_flags |= XFS_DIFLAG_NOATIME; + if (xflags & XFS_XFLAG_NODUMP) + di_flags |= XFS_DIFLAG_NODUMP; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + if (xflags & XFS_XFLAG_NODEFRAG) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (xflags & XFS_XFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + if (S_ISDIR(ip->i_d.di_mode)) { + if (xflags & XFS_XFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (xflags & XFS_XFLAG_NOSYMLINKS) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if (xflags & XFS_XFLAG_EXTSZINHERIT) + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + } else if (S_ISREG(ip->i_d.di_mode)) { + if (xflags & XFS_XFLAG_REALTIME) + di_flags |= XFS_DIFLAG_REALTIME; + if (xflags & XFS_XFLAG_EXTSIZE) + di_flags |= XFS_DIFLAG_EXTSIZE; + } + + ip->i_d.di_flags = di_flags; +} + +STATIC void +xfs_diflags_to_linux( + struct xfs_inode *ip) +{ + struct inode *inode = VFS_I(ip); + unsigned int xflags = xfs_ip2xflags(ip); + + if (xflags & XFS_XFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (xflags & XFS_XFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (xflags & XFS_XFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (xflags & XFS_XFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +#define FSX_PROJID 1 +#define FSX_EXTSIZE 2 +#define FSX_XFLAGS 4 +#define FSX_NONBLOCK 8 + +STATIC int +xfs_ioctl_setattr( + xfs_inode_t *ip, + struct fsxattr *fa, + int mask) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int lock_flags = 0; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *pdqp = NULL; + struct xfs_dquot *olddquot = NULL; + int code; + + trace_xfs_ioctl_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Disallow 32bit project ids when projid32bit feature is not enabled. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + return XFS_ERROR(EINVAL); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) { + code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid, + ip->i_d.di_gid, fa->fsx_projid, + XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp); + if (code) + return code; + } + + /* + * For the other attributes, we acquire the inode lock and + * first do an error checking pass. + */ + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (code) + goto error_return; + + lock_flags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lock_flags); + + /* + * CAP_FOWNER overrides the following restrictions: + * + * The user ID of the calling process must be equal + * to the file owner ID, except in cases where the + * CAP_FSETID capability is applicable. + */ + if (!inode_owner_or_capable(VFS_I(ip))) { + code = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Do a quota reservation only if projid is actually going to change. + * Only allow changing of projid from init_user_ns since it is a + * non user namespace aware identifier. + */ + if (mask & FSX_PROJID) { + if (current_user_ns() != &init_user_ns) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + if (XFS_IS_QUOTA_RUNNING(mp) && + XFS_IS_PQUOTA_ON(mp) && + xfs_get_projid(ip) != fa->fsx_projid) { + ASSERT(tp); + code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, + pdqp, capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (code) /* out of quota */ + goto error_return; + } + } + + if (mask & FSX_EXTSIZE) { + /* + * Can't change extent size if any extents are allocated. + */ + if (ip->i_d.di_nextents && + ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != + fa->fsx_extsize)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * Extent size must be a multiple of the appropriate block + * size, if set at all. It must also be smaller than the + * maximum extent size supported by the filesystem. + * + * Also, for non-realtime files, limit the extent size hint to + * half the size of the AGs in the filesystem so alignment + * doesn't result in extents larger than an AG. + */ + if (fa->fsx_extsize != 0) { + xfs_extlen_t size; + xfs_fsblock_t extsize_fsb; + + extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); + if (extsize_fsb > MAXEXTLEN) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + + if (XFS_IS_REALTIME_INODE(ip) || + ((mask & FSX_XFLAGS) && + (fa->fsx_xflags & XFS_XFLAG_REALTIME))) { + size = mp->m_sb.sb_rextsize << + mp->m_sb.sb_blocklog; + } else { + size = mp->m_sb.sb_blocksize; + if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + if (fa->fsx_extsize % size) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + } + + + if (mask & FSX_XFLAGS) { + /* + * Can't change realtime flag if any extents are allocated. + */ + if ((ip->i_d.di_nextents || ip->i_delayed_blks) && + (XFS_IS_REALTIME_INODE(ip)) != + (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + code = XFS_ERROR(EINVAL); /* EFBIG? */ + goto error_return; + } + + /* + * If realtime flag is set then must have realtime data. + */ + if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + if ((mp->m_sb.sb_rblocks == 0) || + (mp->m_sb.sb_rextsize == 0) || + (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + } + + /* + * Can't modify an immutable/append-only file unless + * we have appropriate permission. + */ + if ((ip->i_d.di_flags & + (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || + (fa->fsx_xflags & + (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + !capable(CAP_LINUX_IMMUTABLE)) { + code = XFS_ERROR(EPERM); + goto error_return; + } + } + + xfs_trans_ijoin(tp, ip, 0); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & FSX_PROJID) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (xfs_get_projid(ip) != fa->fsx_projid) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + olddquot = xfs_qm_vop_chown(tp, ip, + &ip->i_pdquot, pdqp); + } + ASSERT(ip->i_d.di_version > 1); + xfs_set_projid(ip, fa->fsx_projid); + } + + } + + if (mask & FSX_EXTSIZE) + ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + if (mask & FSX_XFLAGS) { + xfs_set_diflags(ip, fa->fsx_xflags); + xfs_diflags_to_linux(ip); + } + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + /* + * If this is a synchronous mount, make sure that the + * transaction goes to disk before returning to the user. + * This is slightly sub-optimal in that truncates require + * two sync transactions instead of one for wsync filesystems. + * One for the truncate and one for the timestamps since we + * don't want to change the timestamps unless we're sure the + * truncate worked. Truncates are less than 1% of the laddis + * mix so this probably isn't worth the trouble to optimize. + */ + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + code = xfs_trans_commit(tp, 0); + xfs_iunlock(ip, lock_flags); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(pdqp); + + return code; + + error_return: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(pdqp); + xfs_trans_cancel(tp, 0); + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return code; +} + +STATIC int +xfs_ioc_fssetxattr( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int mask; + int error; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; +} + +STATIC int +xfs_ioc_getxflags( + xfs_inode_t *ip, + void __user *arg) +{ + unsigned int flags; + + flags = xfs_di2lxflags(ip->i_d.di_flags); + if (copy_to_user(arg, &flags, sizeof(flags))) + return -EFAULT; + return 0; +} + +STATIC int +xfs_ioc_setxflags( + xfs_inode_t *ip, + struct file *filp, + void __user *arg) +{ + struct fsxattr fa; + unsigned int flags; + unsigned int mask; + int error; + + if (copy_from_user(&flags, arg, sizeof(flags))) + return -EFAULT; + + if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ + FS_NOATIME_FL | FS_NODUMP_FL | \ + FS_SYNC_FL)) + return -EOPNOTSUPP; + + mask = FSX_XFLAGS; + if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) + mask |= FSX_NONBLOCK; + fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip)); + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioctl_setattr(ip, &fa, mask); + mnt_drop_write_file(filp); + return -error; +} + +STATIC int +xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmap __user *base = *ap; + + /* copy only getbmap portion (not getbmapx) */ + if (copy_to_user(base, bmv, sizeof(struct getbmap))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmap); + return 0; +} + +STATIC int +xfs_ioc_getbmap( + struct xfs_inode *ip, + int ioflags, + unsigned int cmd, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); + if (ioflags & IO_INVIS) + bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; + + error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, + (struct getbmap *)arg+1); + if (error) + return -error; + + /* copy back header - only size of getbmap */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) +{ + struct getbmapx __user *base = *ap; + + if (copy_to_user(base, bmv, sizeof(struct getbmapx))) + return XFS_ERROR(EFAULT); + + *ap += sizeof(struct getbmapx); + return 0; +} + +STATIC int +xfs_ioc_getbmapx( + struct xfs_inode *ip, + void __user *arg) +{ + struct getbmapx bmx; + int error; + + if (copy_from_user(&bmx, arg, sizeof(bmx))) + return -XFS_ERROR(EFAULT); + + if (bmx.bmv_count < 2) + return -XFS_ERROR(EINVAL); + + if (bmx.bmv_iflags & (~BMV_IF_VALID)) + return -XFS_ERROR(EINVAL); + + error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, + (struct getbmapx *)arg+1); + if (error) + return -error; + + /* copy back header */ + if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) + return -XFS_ERROR(EFAULT); + + return 0; +} + +int +xfs_ioc_swapext( + xfs_swapext_t *sxp) +{ + xfs_inode_t *ip, *tip; + struct fd f, tmp; + int error = 0; + + /* Pull information for the target fd */ + f = fdget((int)sxp->sx_fdtarget); + if (!f.file) { + error = XFS_ERROR(EINVAL); + goto out; + } + + if (!(f.file->f_mode & FMODE_WRITE) || + !(f.file->f_mode & FMODE_READ) || + (f.file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_file; + } + + tmp = fdget((int)sxp->sx_fdtmp); + if (!tmp.file) { + error = XFS_ERROR(EINVAL); + goto out_put_file; + } + + if (!(tmp.file->f_mode & FMODE_WRITE) || + !(tmp.file->f_mode & FMODE_READ) || + (tmp.file->f_flags & O_APPEND)) { + error = XFS_ERROR(EBADF); + goto out_put_tmp_file; + } + + if (IS_SWAPFILE(file_inode(f.file)) || + IS_SWAPFILE(file_inode(tmp.file))) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + ip = XFS_I(file_inode(f.file)); + tip = XFS_I(file_inode(tmp.file)); + + if (ip->i_mount != tip->i_mount) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (ip->i_ino == tip->i_ino) { + error = XFS_ERROR(EINVAL); + goto out_put_tmp_file; + } + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + error = XFS_ERROR(EIO); + goto out_put_tmp_file; + } + + error = xfs_swap_extents(ip, tip, sxp); + + out_put_tmp_file: + fdput(tmp); + out_put_file: + fdput(f); + out: + return error; +} + +/* + * Note: some of the ioctl's return positive numbers as a + * byte count indicating success, such as readlink_by_handle. + * So we don't "sign flip" like most other routines. This means + * true errors need to be returned as a negative value. + */ +long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p) +{ + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + trace_xfs_file_ioctl(ip); + + switch (cmd) { + case FITRIM: + return xfs_ioc_trim(mp, arg); + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { + xfs_flock64_t bf; + + if (copy_from_user(&bf, arg, sizeof(bf))) + return -XFS_ERROR(EFAULT); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_DIOINFO: { + struct dioattr da; + xfs_buftarg_t *target = + XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp; + + da.d_mem = da.d_miniosz = target->bt_logical_sectorsize; + da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1); + + if (copy_to_user(arg, &da, sizeof(da))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_FSBULKSTAT_SINGLE: + case XFS_IOC_FSBULKSTAT: + case XFS_IOC_FSINUMBERS: + return xfs_ioc_bulkstat(mp, cmd, arg); + + case XFS_IOC_FSGEOMETRY_V1: + return xfs_ioc_fsgeometry_v1(mp, arg); + + case XFS_IOC_FSGEOMETRY: + return xfs_ioc_fsgeometry(mp, arg); + + case XFS_IOC_GETVERSION: + return put_user(inode->i_generation, (int __user *)arg); + + case XFS_IOC_FSGETXATTR: + return xfs_ioc_fsgetxattr(ip, 0, arg); + case XFS_IOC_FSGETXATTRA: + return xfs_ioc_fsgetxattr(ip, 1, arg); + case XFS_IOC_FSSETXATTR: + return xfs_ioc_fssetxattr(ip, filp, arg); + case XFS_IOC_GETXFLAGS: + return xfs_ioc_getxflags(ip, arg); + case XFS_IOC_SETXFLAGS: + return xfs_ioc_setxflags(ip, filp, arg); + + case XFS_IOC_FSSETDM: { + struct fsdmidata dmi; + + if (copy_from_user(&dmi, arg, sizeof(dmi))) + return -XFS_ERROR(EFAULT); + + error = mnt_want_write_file(filp); + if (error) + return error; + + error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask, + dmi.fsd_dmstate); + mnt_drop_write_file(filp); + return -error; + } + + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + return xfs_ioc_getbmap(ip, ioflags, cmd, arg); + + case XFS_IOC_GETBMAPX: + return xfs_ioc_getbmapx(ip, arg); + + case XFS_IOC_FD_TO_HANDLE: + case XFS_IOC_PATH_TO_HANDLE: + case XFS_IOC_PATH_TO_FSHANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(hreq))) + return -XFS_ERROR(EFAULT); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_FSSETDM_BY_HANDLE: + return xfs_fssetdm_by_handle(filp, arg); + + case XFS_IOC_READLINK_BY_HANDLE: { + xfs_fsop_handlereq_t hreq; + + if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE: + return xfs_attrlist_by_handle(filp, arg); + + case XFS_IOC_ATTRMULTI_BY_HANDLE: + return xfs_attrmulti_by_handle(filp, arg); + + case XFS_IOC_SWAPEXT: { + struct xfs_swapext sxp; + + if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t))) + return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioc_swapext(&sxp); + mnt_drop_write_file(filp); + return -error; + } + + case XFS_IOC_FSCOUNTS: { + xfs_fsop_counts_t out; + + error = xfs_fs_counts(mp, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_SET_RESBLKS: { + xfs_fsop_resblks_t inout; + __uint64_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); + + if (copy_from_user(&inout, arg, sizeof(inout))) + return -XFS_ERROR(EFAULT); + + error = mnt_want_write_file(filp); + if (error) + return error; + + /* input parameter is passed in resblks field of structure */ + in = inout.resblks; + error = xfs_reserve_blocks(mp, &in, &inout); + mnt_drop_write_file(filp); + if (error) + return -error; + + if (copy_to_user(arg, &inout, sizeof(inout))) + return -XFS_ERROR(EFAULT); + return 0; + } + + case XFS_IOC_GET_RESBLKS: { + xfs_fsop_resblks_t out; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_reserve_blocks(mp, NULL, &out); + if (error) + return -error; + + if (copy_to_user(arg, &out, sizeof(out))) + return -XFS_ERROR(EFAULT); + + return 0; + } + + case XFS_IOC_FSGROWFSDATA: { + xfs_growfs_data_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); + return -error; + } + + case XFS_IOC_FSGROWFSLOG: { + xfs_growfs_log_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_log(mp, &in); + mnt_drop_write_file(filp); + return -error; + } + + case XFS_IOC_FSGROWFSRT: { + xfs_growfs_rt_t in; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); + return -error; + } + + case XFS_IOC_GOINGDOWN: { + __uint32_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__uint32_t __user *)arg)) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_goingdown(mp, in); + return -error; + } + + case XFS_IOC_ERROR_INJECTION: { + xfs_error_injection_t in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&in, arg, sizeof(in))) + return -XFS_ERROR(EFAULT); + + error = xfs_errortag_add(in.errtag, mp); + return -error; + } + + case XFS_IOC_ERROR_CLEARALL: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = xfs_errortag_clearall(mp, 1); + return -error; + + case XFS_IOC_FREE_EOFBLOCKS: { + struct xfs_fs_eofblocks eofb; + struct xfs_eofblocks keofb; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return -XFS_ERROR(EROFS); + + if (copy_from_user(&eofb, arg, sizeof(eofb))) + return -XFS_ERROR(EFAULT); + + error = xfs_fs_eofblocks_from_user(&eofb, &keofb); + if (error) + return -error; + + return -xfs_icache_free_eofblocks(mp, &keofb); + } + + default: + return -ENOTTY; + } +} diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h new file mode 100644 index 00000000000..77c02c7900b --- /dev/null +++ b/fs/xfs/xfs_ioctl.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2008 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL_H__ +#define __XFS_IOCTL_H__ + +extern int +xfs_ioc_space( + struct xfs_inode *ip, + struct inode *inode, + struct file *filp, + int ioflags, + unsigned int cmd, + xfs_flock64_t *bf); + +int +xfs_ioc_swapext( + xfs_swapext_t *sxp); + +extern int +xfs_find_handle( + unsigned int cmd, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_open_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_readlink_by_handle( + struct file *parfilp, + xfs_fsop_handlereq_t *hreq); + +extern int +xfs_attrmulti_attr_get( + struct inode *inode, + unsigned char *name, + unsigned char __user *ubuf, + __uint32_t *len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_set( + struct inode *inode, + unsigned char *name, + const unsigned char __user *ubuf, + __uint32_t len, + __uint32_t flags); + +extern int +xfs_attrmulti_attr_remove( + struct inode *inode, + unsigned char *name, + __uint32_t flags); + +extern struct dentry * +xfs_handle_to_dentry( + struct file *parfilp, + void __user *uhandle, + u32 hlen); + +extern long +xfs_file_ioctl( + struct file *filp, + unsigned int cmd, + unsigned long p); + +extern long +xfs_file_compat_ioctl( + struct file *file, + unsigned int cmd, + unsigned long arg); + +extern int +xfs_set_dmattrs( + struct xfs_inode *ip, + u_int evmask, + u_int16_t state); + +#endif diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c new file mode 100644 index 00000000000..944d5baa710 --- /dev/null +++ b/fs/xfs/xfs_ioctl32.c @@ -0,0 +1,681 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <linux/compat.h> +#include <linux/ioctl.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_vnode.h" +#include "xfs_inode.h" +#include "xfs_itable.h" +#include "xfs_error.h" +#include "xfs_fsops.h" +#include "xfs_alloc.h" +#include "xfs_rtalloc.h" +#include "xfs_attr.h" +#include "xfs_ioctl.h" +#include "xfs_ioctl32.h" +#include "xfs_trace.h" + +#define _NATIVE_IOC(cmd, type) \ + _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type)) + +#ifdef BROKEN_X86_ALIGNMENT +STATIC int +xfs_compat_flock64_copyin( + xfs_flock64_t *bf, + compat_xfs_flock64_t __user *arg32) +{ + if (get_user(bf->l_type, &arg32->l_type) || + get_user(bf->l_whence, &arg32->l_whence) || + get_user(bf->l_start, &arg32->l_start) || + get_user(bf->l_len, &arg32->l_len) || + get_user(bf->l_sysid, &arg32->l_sysid) || + get_user(bf->l_pid, &arg32->l_pid) || + copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_ioc_fsgeometry_v1( + struct xfs_mount *mp, + compat_xfs_fsop_geom_v1_t __user *arg32) +{ + xfs_fsop_geom_t fsgeo; + int error; + + error = xfs_fs_geometry(mp, &fsgeo, 3); + if (error) + return -error; + /* The 32-bit variant simply has some padding at the end */ + if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_data_copyin( + struct xfs_growfs_data *in, + compat_xfs_growfs_data_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->imaxpct, &arg32->imaxpct)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_compat_growfs_rt_copyin( + struct xfs_growfs_rt *in, + compat_xfs_growfs_rt_t __user *arg32) +{ + if (get_user(in->newblocks, &arg32->newblocks) || + get_user(in->extsize, &arg32->extsize)) + return -XFS_ERROR(EFAULT); + return 0; +} + +STATIC int +xfs_inumbers_fmt_compat( + void __user *ubuffer, + const xfs_inogrp_t *buffer, + long count, + long *written) +{ + compat_xfs_inogrp_t __user *p32 = ubuffer; + long i; + + for (i = 0; i < count; i++) { + if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) || + put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) || + put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask)) + return -XFS_ERROR(EFAULT); + } + *written = count * sizeof(*p32); + return 0; +} + +#else +#define xfs_inumbers_fmt_compat xfs_inumbers_fmt +#endif /* BROKEN_X86_ALIGNMENT */ + +STATIC int +xfs_ioctl32_bstime_copyin( + xfs_bstime_t *bstime, + compat_xfs_bstime_t __user *bstime32) +{ + compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */ + + if (get_user(sec32, &bstime32->tv_sec) || + get_user(bstime->tv_nsec, &bstime32->tv_nsec)) + return -XFS_ERROR(EFAULT); + bstime->tv_sec = sec32; + return 0; +} + +/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */ +STATIC int +xfs_ioctl32_bstat_copyin( + xfs_bstat_t *bstat, + compat_xfs_bstat_t __user *bstat32) +{ + if (get_user(bstat->bs_ino, &bstat32->bs_ino) || + get_user(bstat->bs_mode, &bstat32->bs_mode) || + get_user(bstat->bs_nlink, &bstat32->bs_nlink) || + get_user(bstat->bs_uid, &bstat32->bs_uid) || + get_user(bstat->bs_gid, &bstat32->bs_gid) || + get_user(bstat->bs_rdev, &bstat32->bs_rdev) || + get_user(bstat->bs_blksize, &bstat32->bs_blksize) || + get_user(bstat->bs_size, &bstat32->bs_size) || + xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) || + xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) || + get_user(bstat->bs_blocks, &bstat32->bs_size) || + get_user(bstat->bs_xflags, &bstat32->bs_size) || + get_user(bstat->bs_extsize, &bstat32->bs_extsize) || + get_user(bstat->bs_extents, &bstat32->bs_extents) || + get_user(bstat->bs_gen, &bstat32->bs_gen) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || + get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || + get_user(bstat->bs_aextents, &bstat32->bs_aextents)) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* XFS_IOC_FSBULKSTAT and friends */ + +STATIC int +xfs_bstime_store_compat( + compat_xfs_bstime_t __user *p32, + const xfs_bstime_t *p) +{ + __s32 sec32; + + sec32 = p->tv_sec; + if (put_user(sec32, &p32->tv_sec) || + put_user(p->tv_nsec, &p32->tv_nsec)) + return -XFS_ERROR(EFAULT); + return 0; +} + +/* Return 0 on success or positive error (to xfs_bulkstat()) */ +STATIC int +xfs_bulkstat_one_fmt_compat( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) +{ + compat_xfs_bstat_t __user *p32 = ubuffer; + + if (ubsize < sizeof(*p32)) + return XFS_ERROR(ENOMEM); + + if (put_user(buffer->bs_ino, &p32->bs_ino) || + put_user(buffer->bs_mode, &p32->bs_mode) || + put_user(buffer->bs_nlink, &p32->bs_nlink) || + put_user(buffer->bs_uid, &p32->bs_uid) || + put_user(buffer->bs_gid, &p32->bs_gid) || + put_user(buffer->bs_rdev, &p32->bs_rdev) || + put_user(buffer->bs_blksize, &p32->bs_blksize) || + put_user(buffer->bs_size, &p32->bs_size) || + xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) || + xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) || + xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) || + put_user(buffer->bs_blocks, &p32->bs_blocks) || + put_user(buffer->bs_xflags, &p32->bs_xflags) || + put_user(buffer->bs_extsize, &p32->bs_extsize) || + put_user(buffer->bs_extents, &p32->bs_extents) || + put_user(buffer->bs_gen, &p32->bs_gen) || + put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || + put_user(buffer->bs_dmstate, &p32->bs_dmstate) || + put_user(buffer->bs_aextents, &p32->bs_aextents)) + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*p32); + return 0; +} + +STATIC int +xfs_bulkstat_one_compat( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt_compat, + ubused, stat); +} + +/* copied from xfs_ioctl.c */ +STATIC int +xfs_compat_ioc_bulkstat( + xfs_mount_t *mp, + unsigned int cmd, + compat_xfs_fsop_bulkreq_t __user *p32) +{ + u32 addr; + xfs_fsop_bulkreq_t bulkreq; + int count; /* # of records returned */ + xfs_ino_t inlast; /* last inode number */ + int done; + int error; + + /* done = 1 if there are more stats to get and if bulkstat */ + /* should be called again (unused here, but used in dmapi) */ + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + if (get_user(addr, &p32->lastip)) + return -XFS_ERROR(EFAULT); + bulkreq.lastip = compat_ptr(addr); + if (get_user(bulkreq.icount, &p32->icount) || + get_user(addr, &p32->ubuffer)) + return -XFS_ERROR(EFAULT); + bulkreq.ubuffer = compat_ptr(addr); + if (get_user(addr, &p32->ocount)) + return -XFS_ERROR(EFAULT); + bulkreq.ocount = compat_ptr(addr); + + if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) + return -XFS_ERROR(EFAULT); + + if ((count = bulkreq.icount) <= 0) + return -XFS_ERROR(EINVAL); + + if (bulkreq.ubuffer == NULL) + return -XFS_ERROR(EINVAL); + + if (cmd == XFS_IOC_FSINUMBERS_32) { + error = xfs_inumbers(mp, &inlast, &count, + bulkreq.ubuffer, xfs_inumbers_fmt_compat); + } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) { + int res; + + error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer, + sizeof(compat_xfs_bstat_t), NULL, &res); + } else if (cmd == XFS_IOC_FSBULKSTAT_32) { + error = xfs_bulkstat(mp, &inlast, &count, + xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t), + bulkreq.ubuffer, &done); + } else + error = XFS_ERROR(EINVAL); + if (error) + return -error; + + if (bulkreq.ocount != NULL) { + if (copy_to_user(bulkreq.lastip, &inlast, + sizeof(xfs_ino_t))) + return -XFS_ERROR(EFAULT); + + if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) + return -XFS_ERROR(EFAULT); + } + + return 0; +} + +STATIC int +xfs_compat_handlereq_copyin( + xfs_fsop_handlereq_t *hreq, + compat_xfs_fsop_handlereq_t __user *arg32) +{ + compat_xfs_fsop_handlereq_t hreq32; + + if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t))) + return -XFS_ERROR(EFAULT); + + hreq->fd = hreq32.fd; + hreq->path = compat_ptr(hreq32.path); + hreq->oflags = hreq32.oflags; + hreq->ihandle = compat_ptr(hreq32.ihandle); + hreq->ihandlen = hreq32.ihandlen; + hreq->ohandle = compat_ptr(hreq32.ohandle); + hreq->ohandlen = compat_ptr(hreq32.ohandlen); + + return 0; +} + +STATIC struct dentry * +xfs_compat_handlereq_to_dentry( + struct file *parfilp, + compat_xfs_fsop_handlereq_t *hreq) +{ + return xfs_handle_to_dentry(parfilp, + compat_ptr(hreq->ihandle), hreq->ihandlen); +} + +STATIC int +xfs_compat_attrlist_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + attrlist_cursor_kern_t *cursor; + compat_xfs_fsop_attrlist_handlereq_t al_hreq; + struct dentry *dentry; + char *kbuf; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&al_hreq, arg, + sizeof(compat_xfs_fsop_attrlist_handlereq_t))) + return -XFS_ERROR(EFAULT); + if (al_hreq.buflen < sizeof(struct attrlist) || + al_hreq.buflen > XATTR_LIST_MAX) + return -XFS_ERROR(EINVAL); + + /* + * Reject flags, only allow namespaces. + */ + if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) + return -XFS_ERROR(EINVAL); + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = -ENOMEM; + kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP); + if (!kbuf) + goto out_dput; + + cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; + error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + al_hreq.flags, cursor); + if (error) + goto out_kfree; + + if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen)) + error = -EFAULT; + +out_kfree: + kmem_free(kbuf); +out_dput: + dput(dentry); + return error; +} + +STATIC int +xfs_compat_attrmulti_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + compat_xfs_attr_multiop_t *ops; + compat_xfs_fsop_attrmulti_handlereq_t am_hreq; + struct dentry *dentry; + unsigned int i, size; + unsigned char *attr_name; + + if (!capable(CAP_SYS_ADMIN)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&am_hreq, arg, + sizeof(compat_xfs_fsop_attrmulti_handlereq_t))) + return -XFS_ERROR(EFAULT); + + /* overflow check */ + if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t)) + return -E2BIG; + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + error = E2BIG; + size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t); + if (!size || size > 16 * PAGE_SIZE) + goto out_dput; + + ops = memdup_user(compat_ptr(am_hreq.ops), size); + if (IS_ERR(ops)) { + error = -PTR_ERR(ops); + goto out_dput; + } + + error = ENOMEM; + attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); + if (!attr_name) + goto out_kfree_ops; + + error = 0; + for (i = 0; i < am_hreq.opcount; i++) { + ops[i].am_error = strncpy_from_user((char *)attr_name, + compat_ptr(ops[i].am_attrname), + MAXNAMELEN); + if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) + error = ERANGE; + if (ops[i].am_error < 0) + break; + + switch (ops[i].am_opcode) { + case ATTR_OP_GET: + ops[i].am_error = xfs_attrmulti_attr_get( + dentry->d_inode, attr_name, + compat_ptr(ops[i].am_attrvalue), + &ops[i].am_length, ops[i].am_flags); + break; + case ATTR_OP_SET: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_set( + dentry->d_inode, attr_name, + compat_ptr(ops[i].am_attrvalue), + ops[i].am_length, ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write_file(parfilp); + if (ops[i].am_error) + break; + ops[i].am_error = xfs_attrmulti_attr_remove( + dentry->d_inode, attr_name, + ops[i].am_flags); + mnt_drop_write_file(parfilp); + break; + default: + ops[i].am_error = EINVAL; + } + } + + if (copy_to_user(compat_ptr(am_hreq.ops), ops, size)) + error = XFS_ERROR(EFAULT); + + kfree(attr_name); + out_kfree_ops: + kfree(ops); + out_dput: + dput(dentry); + return -error; +} + +STATIC int +xfs_compat_fssetdm_by_handle( + struct file *parfilp, + void __user *arg) +{ + int error; + struct fsdmidata fsd; + compat_xfs_fsop_setdm_handlereq_t dmhreq; + struct dentry *dentry; + + if (!capable(CAP_MKNOD)) + return -XFS_ERROR(EPERM); + if (copy_from_user(&dmhreq, arg, + sizeof(compat_xfs_fsop_setdm_handlereq_t))) + return -XFS_ERROR(EFAULT); + + dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq); + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + error = -XFS_ERROR(EPERM); + goto out; + } + + if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) { + error = -XFS_ERROR(EFAULT); + goto out; + } + + error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + fsd.fsd_dmstate); + +out: + dput(dentry); + return error; +} + +long +xfs_file_compat_ioctl( + struct file *filp, + unsigned cmd, + unsigned long p) +{ + struct inode *inode = file_inode(filp); + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + void __user *arg = (void __user *)p; + int ioflags = 0; + int error; + + if (filp->f_mode & FMODE_NOCMTIME) + ioflags |= IO_INVIS; + + trace_xfs_file_compat_ioctl(ip); + + switch (cmd) { + /* No size or alignment issues on any arch */ + case XFS_IOC_DIOINFO: + case XFS_IOC_FSGEOMETRY: + case XFS_IOC_FSGETXATTR: + case XFS_IOC_FSSETXATTR: + case XFS_IOC_FSGETXATTRA: + case XFS_IOC_FSSETDM: + case XFS_IOC_GETBMAP: + case XFS_IOC_GETBMAPA: + case XFS_IOC_GETBMAPX: + case XFS_IOC_FSCOUNTS: + case XFS_IOC_SET_RESBLKS: + case XFS_IOC_GET_RESBLKS: + case XFS_IOC_FSGROWFSLOG: + case XFS_IOC_GOINGDOWN: + case XFS_IOC_ERROR_INJECTION: + case XFS_IOC_ERROR_CLEARALL: + return xfs_file_ioctl(filp, cmd, p); +#ifndef BROKEN_X86_ALIGNMENT + /* These are handled fine if no alignment issues */ + case XFS_IOC_ALLOCSP: + case XFS_IOC_FREESP: + case XFS_IOC_RESVSP: + case XFS_IOC_UNRESVSP: + case XFS_IOC_ALLOCSP64: + case XFS_IOC_FREESP64: + case XFS_IOC_RESVSP64: + case XFS_IOC_UNRESVSP64: + case XFS_IOC_FSGEOMETRY_V1: + case XFS_IOC_FSGROWFSDATA: + case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: + return xfs_file_ioctl(filp, cmd, p); +#else + case XFS_IOC_ALLOCSP_32: + case XFS_IOC_FREESP_32: + case XFS_IOC_ALLOCSP64_32: + case XFS_IOC_FREESP64_32: + case XFS_IOC_RESVSP_32: + case XFS_IOC_UNRESVSP_32: + case XFS_IOC_RESVSP64_32: + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { + struct xfs_flock64 bf; + + if (xfs_compat_flock64_copyin(&bf, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct xfs_flock64); + return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); + } + case XFS_IOC_FSGEOMETRY_V1_32: + return xfs_compat_ioc_fsgeometry_v1(mp, arg); + case XFS_IOC_FSGROWFSDATA_32: { + struct xfs_growfs_data in; + + if (xfs_compat_growfs_data_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_data(mp, &in); + mnt_drop_write_file(filp); + return -error; + } + case XFS_IOC_FSGROWFSRT_32: { + struct xfs_growfs_rt in; + + if (xfs_compat_growfs_rt_copyin(&in, arg)) + return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_growfs_rt(mp, &in); + mnt_drop_write_file(filp); + return -error; + } +#endif + /* long changes size, but xfs only copiese out 32 bits */ + case XFS_IOC_GETXFLAGS_32: + case XFS_IOC_SETXFLAGS_32: + case XFS_IOC_GETVERSION_32: + cmd = _NATIVE_IOC(cmd, long); + return xfs_file_ioctl(filp, cmd, p); + case XFS_IOC_SWAPEXT_32: { + struct xfs_swapext sxp; + struct compat_xfs_swapext __user *sxu = arg; + + /* Bulk copy in up to the sx_stat field, then copy bstat */ + if (copy_from_user(&sxp, sxu, + offsetof(struct xfs_swapext, sx_stat)) || + xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat)) + return -XFS_ERROR(EFAULT); + error = mnt_want_write_file(filp); + if (error) + return error; + error = xfs_ioc_swapext(&sxp); + mnt_drop_write_file(filp); + return -error; + } + case XFS_IOC_FSBULKSTAT_32: + case XFS_IOC_FSBULKSTAT_SINGLE_32: + case XFS_IOC_FSINUMBERS_32: + return xfs_compat_ioc_bulkstat(mp, cmd, arg); + case XFS_IOC_FD_TO_HANDLE_32: + case XFS_IOC_PATH_TO_HANDLE_32: + case XFS_IOC_PATH_TO_FSHANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq); + return xfs_find_handle(cmd, &hreq); + } + case XFS_IOC_OPEN_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_open_by_handle(filp, &hreq); + } + case XFS_IOC_READLINK_BY_HANDLE_32: { + struct xfs_fsop_handlereq hreq; + + if (xfs_compat_handlereq_copyin(&hreq, arg)) + return -XFS_ERROR(EFAULT); + return xfs_readlink_by_handle(filp, &hreq); + } + case XFS_IOC_ATTRLIST_BY_HANDLE_32: + return xfs_compat_attrlist_by_handle(filp, arg); + case XFS_IOC_ATTRMULTI_BY_HANDLE_32: + return xfs_compat_attrmulti_by_handle(filp, arg); + case XFS_IOC_FSSETDM_BY_HANDLE_32: + return xfs_compat_fssetdm_by_handle(filp, arg); + default: + return -XFS_ERROR(ENOIOCTLCMD); + } +} diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h new file mode 100644 index 00000000000..80f4060e897 --- /dev/null +++ b/fs/xfs/xfs_ioctl32.h @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2004-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_IOCTL32_H__ +#define __XFS_IOCTL32_H__ + +#include <linux/compat.h> + +/* + * on 32-bit arches, ioctl argument structures may have different sizes + * and/or alignment. We define compat structures which match the + * 32-bit sizes/alignments here, and their associated ioctl numbers. + * + * xfs_ioctl32.c contains routines to copy these structures in and out. + */ + +/* stock kernel-level ioctls we support */ +#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS +#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS +#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION + +/* + * On intel, even if sizes match, alignment and/or padding may differ. + */ +#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#define BROKEN_X86_ALIGNMENT +#define __compat_packed __attribute__((packed)) +#else +#define __compat_packed +#endif + +typedef struct compat_xfs_bstime { + compat_time_t tv_sec; /* seconds */ + __s32 tv_nsec; /* and nanoseconds */ +} compat_xfs_bstime_t; + +typedef struct compat_xfs_bstat { + __u64 bs_ino; /* inode number */ + __u16 bs_mode; /* type and mode */ + __u16 bs_nlink; /* number of links */ + __u32 bs_uid; /* user id */ + __u32 bs_gid; /* group id */ + __u32 bs_rdev; /* device value */ + __s32 bs_blksize; /* block size */ + __s64 bs_size; /* file size */ + compat_xfs_bstime_t bs_atime; /* access time */ + compat_xfs_bstime_t bs_mtime; /* modify time */ + compat_xfs_bstime_t bs_ctime; /* inode change time */ + int64_t bs_blocks; /* number of blocks */ + __u32 bs_xflags; /* extended flags */ + __s32 bs_extsize; /* extent size */ + __s32 bs_extents; /* number of extents */ + __u32 bs_gen; /* generation count */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[12]; /* pad space, unused */ + __u32 bs_dmevmask; /* DMIG event mask */ + __u16 bs_dmstate; /* DMIG state info */ + __u16 bs_aextents; /* attribute number of extents */ +} __compat_packed compat_xfs_bstat_t; + +typedef struct compat_xfs_fsop_bulkreq { + compat_uptr_t lastip; /* last inode # pointer */ + __s32 icount; /* count of entries in buffer */ + compat_uptr_t ubuffer; /* user buffer for inode desc. */ + compat_uptr_t ocount; /* output count pointer */ +} compat_xfs_fsop_bulkreq_t; + +#define XFS_IOC_FSBULKSTAT_32 \ + _IOWR('X', 101, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSBULKSTAT_SINGLE_32 \ + _IOWR('X', 102, struct compat_xfs_fsop_bulkreq) +#define XFS_IOC_FSINUMBERS_32 \ + _IOWR('X', 103, struct compat_xfs_fsop_bulkreq) + +typedef struct compat_xfs_fsop_handlereq { + __u32 fd; /* fd for FD_TO_HANDLE */ + compat_uptr_t path; /* user pathname */ + __u32 oflags; /* open flags */ + compat_uptr_t ihandle; /* user supplied handle */ + __u32 ihandlen; /* user supplied length */ + compat_uptr_t ohandle; /* user buffer for handle */ + compat_uptr_t ohandlen; /* user buffer length */ +} compat_xfs_fsop_handlereq_t; + +#define XFS_IOC_PATH_TO_FSHANDLE_32 \ + _IOWR('X', 104, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_PATH_TO_HANDLE_32 \ + _IOWR('X', 105, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_FD_TO_HANDLE_32 \ + _IOWR('X', 106, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_OPEN_BY_HANDLE_32 \ + _IOWR('X', 107, struct compat_xfs_fsop_handlereq) +#define XFS_IOC_READLINK_BY_HANDLE_32 \ + _IOWR('X', 108, struct compat_xfs_fsop_handlereq) + +/* The bstat field in the swapext struct needs translation */ +typedef struct compat_xfs_swapext { + __int64_t sx_version; /* version */ + __int64_t sx_fdtarget; /* fd of target file */ + __int64_t sx_fdtmp; /* fd of tmp file */ + xfs_off_t sx_offset; /* offset into file */ + xfs_off_t sx_length; /* leng from offset */ + char sx_pad[16]; /* pad space, unused */ + compat_xfs_bstat_t sx_stat; /* stat of target b4 copy */ +} __compat_packed compat_xfs_swapext_t; + +#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext) + +typedef struct compat_xfs_fsop_attrlist_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */ + __u32 flags; /* which namespace to use */ + __u32 buflen; /* length of buffer supplied */ + compat_uptr_t buffer; /* returned names */ +} __compat_packed compat_xfs_fsop_attrlist_handlereq_t; + +/* Note: actually this is read/write */ +#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \ + _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq) + +/* am_opcodes defined in xfs_fs.h */ +typedef struct compat_xfs_attr_multiop { + __u32 am_opcode; + __s32 am_error; + compat_uptr_t am_attrname; + compat_uptr_t am_attrvalue; + __u32 am_length; + __u32 am_flags; +} compat_xfs_attr_multiop_t; + +typedef struct compat_xfs_fsop_attrmulti_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */ + __u32 opcount;/* count of following multiop */ + /* ptr to compat_xfs_attr_multiop */ + compat_uptr_t ops; /* attr_multi data */ +} compat_xfs_fsop_attrmulti_handlereq_t; + +#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \ + _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq) + +typedef struct compat_xfs_fsop_setdm_handlereq { + struct compat_xfs_fsop_handlereq hreq; /* handle information */ + /* ptr to struct fsdmidata */ + compat_uptr_t data; /* DMAPI data */ +} compat_xfs_fsop_setdm_handlereq_t; + +#define XFS_IOC_FSSETDM_BY_HANDLE_32 \ + _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq) + +#ifdef BROKEN_X86_ALIGNMENT +/* on ia32 l_start is on a 32-bit boundary */ +typedef struct compat_xfs_flock64 { + __s16 l_type; + __s16 l_whence; + __s64 l_start __attribute__((packed)); + /* len == 0 means until end of file */ + __s64 l_len __attribute__((packed)); + __s32 l_sysid; + __u32 l_pid; + __s32 l_pad[4]; /* reserve area */ +} compat_xfs_flock64_t; + +#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64) +#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64) +#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64) +#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) +#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) +#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) + +typedef struct compat_xfs_fsop_geom_v1 { + __u32 blocksize; /* filesystem (data) block size */ + __u32 rtextsize; /* realtime extent size */ + __u32 agblocks; /* fsblocks in an AG */ + __u32 agcount; /* number of allocation groups */ + __u32 logblocks; /* fsblocks in the log */ + __u32 sectsize; /* (data) sector size, bytes */ + __u32 inodesize; /* inode size in bytes */ + __u32 imaxpct; /* max allowed inode space(%) */ + __u64 datablocks; /* fsblocks in data subvolume */ + __u64 rtblocks; /* fsblocks in realtime subvol */ + __u64 rtextents; /* rt extents in realtime subvol*/ + __u64 logstart; /* starting fsblock of the log */ + unsigned char uuid[16]; /* unique id of the filesystem */ + __u32 sunit; /* stripe unit, fsblocks */ + __u32 swidth; /* stripe width, fsblocks */ + __s32 version; /* structure version */ + __u32 flags; /* superblock version flags */ + __u32 logsectsize; /* log sector size, bytes */ + __u32 rtsectsize; /* realtime sector size, bytes */ + __u32 dirblocksize; /* directory block size, bytes */ +} __attribute__((packed)) compat_xfs_fsop_geom_v1_t; + +#define XFS_IOC_FSGEOMETRY_V1_32 \ + _IOR('X', 100, struct compat_xfs_fsop_geom_v1) + +typedef struct compat_xfs_inogrp { + __u64 xi_startino; /* starting inode number */ + __s32 xi_alloccount; /* # bits set in allocmask */ + __u64 xi_allocmask; /* mask of allocated inodes */ +} __attribute__((packed)) compat_xfs_inogrp_t; + +/* These growfs input structures have padding on the end, so must translate */ +typedef struct compat_xfs_growfs_data { + __u64 newblocks; /* new data subvol size, fsblocks */ + __u32 imaxpct; /* new inode space percentage limit */ +} __attribute__((packed)) compat_xfs_growfs_data_t; + +typedef struct compat_xfs_growfs_rt { + __u64 newblocks; /* new realtime size, fsblocks */ + __u32 extsize; /* new realtime extent size, fsblocks */ +} __attribute__((packed)) compat_xfs_growfs_rt_t; + +#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data) +#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt) + +#endif /* BROKEN_X86_ALIGNMENT */ + +#endif /* __XFS_IOCTL32_H__ */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f1949c16df1..6d3ec2b6ee2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -17,332 +17,62 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" #include "xfs_btree.h" +#include "xfs_bmap_btree.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" +#include "xfs_bmap_util.h" #include "xfs_error.h" -#include "xfs_itable.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" -#include "xfs_utils.h" #include "xfs_iomap.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_quota.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_dinode.h" -#if defined(XFS_RW_TRACE) -void -xfs_iomap_enter_trace( - int tag, - xfs_iocore_t *io, - xfs_off_t offset, - ssize_t count) -{ - xfs_inode_t *ip = XFS_IO_INODE(io); - - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)count), - (void *)((unsigned long)((io->io_new_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(io->io_new_size & 0xffffffff)), - (void *)((unsigned long)current_pid()), - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL, - (void *)NULL); -} - -void -xfs_iomap_map_trace( - int tag, - xfs_iocore_t *io, - xfs_off_t offset, - ssize_t count, - xfs_iomap_t *iomapp, - xfs_bmbt_irec_t *imapp, - int flags) -{ - xfs_inode_t *ip = XFS_IO_INODE(io); - - if (!ip->i_rwtrace) - return; - - ktrace_enter(ip->i_rwtrace, - (void *)((unsigned long)tag), - (void *)ip, - (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), - (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), - (void *)((unsigned long)((offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(offset & 0xffffffff)), - (void *)((unsigned long)count), - (void *)((unsigned long)flags), - (void *)((unsigned long)((iomapp->iomap_offset >> 32) & 0xffffffff)), - (void *)((unsigned long)(iomapp->iomap_offset & 0xffffffff)), - (void *)((unsigned long)(iomapp->iomap_delta)), - (void *)((unsigned long)(iomapp->iomap_bsize)), - (void *)((unsigned long)(iomapp->iomap_bn)), - (void *)(__psint_t)(imapp->br_startoff), - (void *)((unsigned long)(imapp->br_blockcount)), - (void *)(__psint_t)(imapp->br_startblock)); -} -#else -#define xfs_iomap_enter_trace(tag, io, offset, count) -#define xfs_iomap_map_trace(tag, io, offset, count, iomapp, imapp, flags) -#endif #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ << mp->m_writeio_log) -#define XFS_STRAT_WRITE_IMAPS 2 #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP STATIC int -xfs_imap_to_bmap( - xfs_iocore_t *io, - xfs_off_t offset, - xfs_bmbt_irec_t *imap, - xfs_iomap_t *iomapp, - int imaps, /* Number of imap entries */ - int iomaps, /* Number of iomap entries */ - int flags) -{ - xfs_mount_t *mp; - xfs_fsize_t nisize; - int pbm; - xfs_fsblock_t start_block; - - mp = io->io_mount; - nisize = XFS_SIZE(mp, io); - if (io->io_new_size > nisize) - nisize = io->io_new_size; - - for (pbm = 0; imaps && pbm < iomaps; imaps--, iomapp++, imap++, pbm++) { - iomapp->iomap_offset = XFS_FSB_TO_B(mp, imap->br_startoff); - iomapp->iomap_delta = offset - iomapp->iomap_offset; - iomapp->iomap_bsize = XFS_FSB_TO_B(mp, imap->br_blockcount); - iomapp->iomap_flags = flags; - - if (io->io_flags & XFS_IOCORE_RT) { - iomapp->iomap_flags |= IOMAP_REALTIME; - iomapp->iomap_target = mp->m_rtdev_targp; - } else { - iomapp->iomap_target = mp->m_ddev_targp; - } - start_block = imap->br_startblock; - if (start_block == HOLESTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_HOLE; - } else if (start_block == DELAYSTARTBLOCK) { - iomapp->iomap_bn = IOMAP_DADDR_NULL; - iomapp->iomap_flags |= IOMAP_DELAY; - } else { - iomapp->iomap_bn = XFS_FSB_TO_DB_IO(io, start_block); - if (ISUNWRITTEN(imap)) - iomapp->iomap_flags |= IOMAP_UNWRITTEN; - } - - if ((iomapp->iomap_offset + iomapp->iomap_bsize) >= nisize) { - iomapp->iomap_flags |= IOMAP_EOF; - } - - offset += iomapp->iomap_bsize - iomapp->iomap_delta; - } - return pbm; /* Return the number filled */ -} - -int -xfs_iomap( - xfs_iocore_t *io, - xfs_off_t offset, - ssize_t count, - int flags, - xfs_iomap_t *iomapp, - int *niomaps) -{ - xfs_mount_t *mp = io->io_mount; - xfs_fileoff_t offset_fsb, end_fsb; - int error = 0; - int lockmode = 0; - xfs_bmbt_irec_t imap; - int nimaps = 1; - int bmapi_flags = 0; - int iomap_flags = 0; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - switch (flags & - (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE | - BMAPI_UNWRITTEN | BMAPI_DEVICE)) { - case BMAPI_READ: - xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count); - lockmode = XFS_LCK_MAP_SHARED(mp, io); - bmapi_flags = XFS_BMAPI_ENTIRE; - break; - case BMAPI_WRITE: - xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count); - lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; - if (flags & BMAPI_IGNSTATE) - bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; - XFS_ILOCK(mp, io, lockmode); - break; - case BMAPI_ALLOCATE: - xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, io, offset, count); - lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; - bmapi_flags = XFS_BMAPI_ENTIRE; - /* Attempt non-blocking lock */ - if (flags & BMAPI_TRYLOCK) { - if (!XFS_ILOCK_NOWAIT(mp, io, lockmode)) - return XFS_ERROR(EAGAIN); - } else { - XFS_ILOCK(mp, io, lockmode); - } - break; - case BMAPI_UNWRITTEN: - goto phase2; - case BMAPI_DEVICE: - lockmode = XFS_LCK_MAP_SHARED(mp, io); - iomapp->iomap_target = io->io_flags & XFS_IOCORE_RT ? - mp->m_rtdev_targp : mp->m_ddev_targp; - error = 0; - *niomaps = 1; - goto out; - default: - BUG(); - } - - ASSERT(offset <= mp->m_maxioffset); - if ((xfs_fsize_t)offset + count > mp->m_maxioffset) - count = mp->m_maxioffset - offset; - end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - - error = XFS_BMAPI(mp, NULL, io, offset_fsb, - (xfs_filblks_t)(end_fsb - offset_fsb), - bmapi_flags, NULL, 0, &imap, - &nimaps, NULL, NULL); - - if (error) - goto out; - -phase2: - switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE|BMAPI_UNWRITTEN)) { - case BMAPI_WRITE: - /* If we found an extent, return it */ - if (nimaps && - (imap.br_startblock != HOLESTARTBLOCK) && - (imap.br_startblock != DELAYSTARTBLOCK)) { - xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, - offset, count, iomapp, &imap, flags); - break; - } - - if (flags & (BMAPI_DIRECT|BMAPI_MMAP)) { - error = XFS_IOMAP_WRITE_DIRECT(mp, io, offset, - count, flags, &imap, &nimaps, nimaps); - } else { - error = XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, - flags, &imap, &nimaps); - } - if (!error) { - xfs_iomap_map_trace(XFS_IOMAP_ALLOC_MAP, io, - offset, count, iomapp, &imap, flags); - } - iomap_flags = IOMAP_NEW; - break; - case BMAPI_ALLOCATE: - /* If we found an extent, return it */ - XFS_IUNLOCK(mp, io, lockmode); - lockmode = 0; - - if (nimaps && !ISNULLSTARTBLOCK(imap.br_startblock)) { - xfs_iomap_map_trace(XFS_IOMAP_WRITE_MAP, io, - offset, count, iomapp, &imap, flags); - break; - } - - error = XFS_IOMAP_WRITE_ALLOCATE(mp, io, offset, count, - &imap, &nimaps); - break; - case BMAPI_UNWRITTEN: - lockmode = 0; - error = XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count); - nimaps = 0; - break; - } - - if (nimaps) { - *niomaps = xfs_imap_to_bmap(io, offset, &imap, - iomapp, nimaps, *niomaps, iomap_flags); - } else if (niomaps) { - *niomaps = 0; - } - -out: - if (lockmode) - XFS_IUNLOCK(mp, io, lockmode); - return XFS_ERROR(error); -} - -STATIC int xfs_iomap_eof_align_last_fsb( xfs_mount_t *mp, - xfs_iocore_t *io, - xfs_fsize_t isize, + xfs_inode_t *ip, xfs_extlen_t extsize, xfs_fileoff_t *last_fsb) { xfs_fileoff_t new_last_fsb = 0; - xfs_extlen_t align; + xfs_extlen_t align = 0; int eof, error; - if (io->io_flags & XFS_IOCORE_RT) - ; - /* - * If mounted with the "-o swalloc" option, roundup the allocation - * request to a stripe width boundary if the file size is >= - * stripe width and we are allocating past the allocation eof. - */ - else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && - (isize >= XFS_FSB_TO_B(mp, mp->m_swidth))) - new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); - /* - * Roundup the allocation request to a stripe unit (m_dalign) boundary - * if the file size is >= stripe unit size, and we are allocating past - * the allocation eof. - */ - else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign))) - new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); + if (!XFS_IS_REALTIME_INODE(ip)) { + /* + * Round up the allocation request to a stripe unit + * (m_dalign) boundary if the file size is >= stripe unit + * size, and we are allocating past the allocation eof. + * + * If mounted with the "-o swalloc" option the alignment is + * increased from the strip unit size to the stripe width. + */ + if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + align = mp->m_swidth; + else if (mp->m_dalign) + align = mp->m_dalign; + + if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align)) + new_last_fsb = roundup_64(*last_fsb, align); + } /* * Always round up the allocation request to an extent boundary @@ -357,7 +87,7 @@ xfs_iomap_eof_align_last_fsb( } if (new_last_fsb) { - error = XFS_BMAP_EOF(mp, io, new_last_fsb, XFS_DATA_FORK, &eof); + error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof); if (error) return error; if (eof) @@ -367,35 +97,20 @@ xfs_iomap_eof_align_last_fsb( } STATIC int -xfs_flush_space( +xfs_alert_fsblock_zero( xfs_inode_t *ip, - int *fsynced, - int *ioflags) + xfs_bmbt_irec_t *imap) { - switch (*fsynced) { - case 0: - if (ip->i_delayed_blks) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_inode(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 1; - } else { - *ioflags |= BMAPI_SYNC; - *fsynced = 2; - } - return 0; - case 1: - *fsynced = 2; - *ioflags |= BMAPI_SYNC; - return 0; - case 2: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_flush_device(ip); - xfs_ilock(ip, XFS_ILOCK_EXCL); - *fsynced = 3; - return 0; - } - return 1; + xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, + "Access to block zero in inode %llu " + "start_block: %llx start_off: %llx " + "blkcnt: %llx extent-state: %x", + (unsigned long long)ip->i_ino, + (unsigned long long)imap->br_startblock, + (unsigned long long)imap->br_startoff, + (unsigned long long)imap->br_blockcount, + imap->br_state); + return EFSCORRUPTED; } int @@ -403,62 +118,42 @@ xfs_iomap_write_direct( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int flags, - xfs_bmbt_irec_t *ret_imap, - int *nmaps, - int found) + xfs_bmbt_irec_t *imap, + int nmaps) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; xfs_filblks_t count_fsb, resaligned; xfs_fsblock_t firstfsb; xfs_extlen_t extsz, temp; - xfs_fsize_t isize; int nimaps; - int bmapi_flag; int quota_flag; int rt; xfs_trans_t *tp; - xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; int committed; int error; - /* - * Make sure that the dquots are there. This doesn't hold - * the ilock across a disk read. - */ - error = XFS_QM_DQATTACH(ip->i_mount, ip, XFS_QMOPT_ILOCKED); + error = xfs_qm_dqattach(ip, 0); if (error) return XFS_ERROR(error); rt = XFS_IS_REALTIME_INODE(ip); - if (unlikely(rt)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - - isize = ip->i_d.di_size; - if (io->io_new_size > isize) - isize = io->io_new_size; + extsz = xfs_get_extsz_hint(ip); - offset_fsb = XFS_B_TO_FSBT(mp, offset); - last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); - if ((offset + count) > isize) { - error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, - &last_fsb); + offset_fsb = XFS_B_TO_FSBT(mp, offset); + last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); + if ((offset + count) > XFS_ISIZE(ip)) { + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - goto error_out; + return XFS_ERROR(error); } else { - if (found && (ret_imap->br_startblock == HOLESTARTBLOCK)) + if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) - ret_imap->br_blockcount + - ret_imap->br_startoff); + imap->br_blockcount + + imap->br_startoff); } count_fsb = last_fsb - offset_fsb; ASSERT(count_fsb > 0); @@ -474,118 +169,95 @@ xfs_iomap_write_direct( if (unlikely(rt)) { resrtextents = qblocks = resaligned; resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; - } else { - resrtextents = 0; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + quota_flag = XFS_QMOPT_RES_RTBLKS; + } else { + resrtextents = 0; resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); - quota_flag = XFS_QMOPT_RES_REGBLKS; - } + quota_flag = XFS_QMOPT_RES_REGBLKS; + } /* * Allocate and setup the transaction */ - xfs_iunlock(ip, XFS_ILOCK_EXCL); tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), resrtextents, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, resrtextents); /* * Check for running out of space, note: need lock to return */ - if (error) + if (error) { xfs_trans_cancel(tp, 0); + return XFS_ERROR(error); + } + xfs_ilock(ip, XFS_ILOCK_EXCL); - if (error) - goto error_out; - error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, - qblocks, 0, quota_flag); + error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + goto out_trans_cancel; - bmapi_flag = XFS_BMAPI_WRITE; - if ((flags & BMAPI_DIRECT) && (offset < ip->i_d.di_size || extsz)) - bmapi_flag |= XFS_BMAPI_PREALLOC; + xfs_trans_ijoin(tp, ip, 0); /* - * Issue the xfs_bmapi() call to allocate the blocks + * From this point onwards we overwrite the imap pointer that the + * caller gave to us. */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, bmapi_flag, - &firstfsb, 0, &imap, &nimaps, &free_list, NULL); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_PREALLOC, &firstfsb, 0, + imap, &nimaps, &free_list); if (error) - goto error0; + goto out_bmap_cancel; /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) - goto error0; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + goto out_bmap_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) - goto error_out; + goto out_unlock; /* * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { - error = (ENOSPC); - goto error_out; + error = XFS_ERROR(ENOSPC); + goto out_unlock; } - *ret_imap = imap; - *nmaps = 1; - if ( !(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)ret_imap->br_startblock, - (unsigned long long)ret_imap->br_startoff, - (unsigned long long)ret_imap->br_blockcount, - ret_imap->br_state); - } - return 0; + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + error = xfs_alert_fsblock_zero(ip, imap); -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ - xfs_bmap_cancel(&free_list); - XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag); +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; -error1: /* Just cancel transaction */ +out_bmap_cancel: + xfs_bmap_cancel(&free_list); + xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); +out_trans_cancel: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - *nmaps = 0; /* nothing set-up here */ - -error_out: - return XFS_ERROR(error); + goto out_unlock; } /* - * If the caller is doing a write at the end of the file, - * then extend the allocation out to the file system's write - * iosize. We clean up any extra space left over when the - * file is closed in xfs_inactive(). + * If the caller is doing a write at the end of the file, then extend the + * allocation out to the file system's write iosize. We clean up any extra + * space left over when the file is closed in xfs_inactive(). * - * For sync writes, we are flushing delayed allocate space to - * try to make additional space available for allocation near - * the filesystem full boundary - preallocation hurts in that - * situation, of course. + * If we find we already have delalloc preallocation beyond EOF, don't do more + * preallocation as it it not needed. */ STATIC int xfs_iomap_eof_want_preallocate( xfs_mount_t *mp, - xfs_iocore_t *io, - xfs_fsize_t isize, + xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, xfs_bmbt_irec_t *imap, int nimaps, int *prealloc) @@ -594,9 +266,19 @@ xfs_iomap_eof_want_preallocate( xfs_filblks_t count_fsb; xfs_fsblock_t firstblock; int n, error, imaps; + int found_delalloc = 0; *prealloc = 0; - if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize) + if (offset + count <= XFS_ISIZE(ip)) + return 0; + + /* + * If the file is smaller than the minimum prealloc and we are using + * dynamic preallocation, don't do any preallocation at all as it is + * likely this is the only write to the file that is going to be done. + */ + if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) && + XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)) return 0; /* @@ -604,12 +286,12 @@ xfs_iomap_eof_want_preallocate( * do any speculative allocation. */ start_fsb = XFS_B_TO_FSBT(mp, ((xfs_ufsize_t)(offset + count - 1))); - count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); + count_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); while (count_fsb > 0) { imaps = nimaps; firstblock = NULLFSBLOCK; - error = XFS_BMAPI(mp, NULL, io, start_fsb, count_fsb, 0, - &firstblock, 0, imap, &imaps, NULL, NULL); + error = xfs_bmapi_read(ip, start_fsb, count_fsb, imap, &imaps, + 0); if (error) return error; for (n = 0; n < imaps; n++) { @@ -618,118 +300,334 @@ xfs_iomap_eof_want_preallocate( return 0; start_fsb += imap[n].br_blockcount; count_fsb -= imap[n].br_blockcount; + + if (imap[n].br_startblock == DELAYSTARTBLOCK) + found_delalloc = 1; } } - *prealloc = 1; + if (!found_delalloc) + *prealloc = 1; return 0; } +/* + * Determine the initial size of the preallocation. We are beyond the current + * EOF here, but we need to take into account whether this is a sparse write or + * an extending write when determining the preallocation size. Hence we need to + * look up the extent that ends at the current write offset and use the result + * to determine the preallocation size. + * + * If the extent is a hole, then preallocation is essentially disabled. + * Otherwise we take the size of the preceeding data extent as the basis for the + * preallocation size. If the size of the extent is greater than half the + * maximum extent length, then use the current offset as the basis. This ensures + * that for large files the preallocation size always extends to MAXEXTLEN + * rather than falling short due to things like stripe unit/width alignment of + * real extents. + */ +STATIC xfs_fsblock_t +xfs_iomap_eof_prealloc_initial_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + xfs_bmbt_irec_t *imap, + int nimaps) +{ + xfs_fileoff_t start_fsb; + int imaps = 1; + int error; + + ASSERT(nimaps >= imaps); + + /* if we are using a specific prealloc size, return now */ + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + return 0; + + /* If the file is small, then use the minimum prealloc */ + if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign)) + return 0; + + /* + * As we write multiple pages, the offset will always align to the + * start of a page and hence point to a hole at EOF. i.e. if the size is + * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) + * will return FSB 1. Hence if there are blocks in the file, we want to + * point to the block prior to the EOF block and not the hole that maps + * directly at @offset. + */ + start_fsb = XFS_B_TO_FSB(mp, offset); + if (start_fsb) + start_fsb--; + error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); + if (error) + return 0; + + ASSERT(imaps == 1); + if (imap[0].br_startblock == HOLESTARTBLOCK) + return 0; + if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) + return imap[0].br_blockcount << 1; + return XFS_B_TO_FSB(mp, offset); +} + +STATIC bool +xfs_quota_need_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t alloc_blocks) +{ + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + if (!dq || !xfs_this_quota_on(ip->i_mount, type)) + return false; + + /* no hi watermark, no throttle */ + if (!dq->q_prealloc_hi_wmark) + return false; + + /* under the lo watermark, no throttle */ + if (dq->q_res_bcount + alloc_blocks < dq->q_prealloc_lo_wmark) + return false; + + return true; +} + +STATIC void +xfs_quota_calc_throttle( + struct xfs_inode *ip, + int type, + xfs_fsblock_t *qblocks, + int *qshift) +{ + int64_t freesp; + int shift = 0; + struct xfs_dquot *dq = xfs_inode_dquot(ip, type); + + /* over hi wmark, squash the prealloc completely */ + if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + *qblocks = 0; + return; + } + + freesp = dq->q_prealloc_hi_wmark - dq->q_res_bcount; + if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { + shift = 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) + shift += 2; + if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) + shift += 2; + } + + /* only overwrite the throttle values if we are more aggressive */ + if ((freesp >> shift) < (*qblocks >> *qshift)) { + *qblocks = freesp; + *qshift = shift; + } +} + +/* + * If we don't have a user specified preallocation size, dynamically increase + * the preallocation size as the size of the file grows. Cap the maximum size + * at a single extent or less if the filesystem is near full. The closer the + * filesystem is to full, the smaller the maximum prealocation. + */ +STATIC xfs_fsblock_t +xfs_iomap_prealloc_size( + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_off_t offset, + struct xfs_bmbt_irec *imap, + int nimaps) +{ + xfs_fsblock_t alloc_blocks = 0; + int shift = 0; + int64_t freesp; + xfs_fsblock_t qblocks; + int qshift = 0; + + alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, + imap, nimaps); + if (!alloc_blocks) + goto check_writeio; + qblocks = alloc_blocks; + + /* + * MAXEXTLEN is not a power of two value but we round the prealloc down + * to the nearest power of two value after throttling. To prevent the + * round down from unconditionally reducing the maximum supported prealloc + * size, we round up first, apply appropriate throttling, round down and + * cap the value to MAXEXTLEN. + */ + alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), + alloc_blocks); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + freesp = mp->m_sb.sb_fdblocks; + if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { + shift = 2; + if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) + shift++; + if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) + shift++; + } + + /* + * Check each quota to cap the prealloc size and provide a shift + * value to throttle with. + */ + if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift); + if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks)) + xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift); + + /* + * The final prealloc size is set to the minimum of free space available + * in each of the quotas and the overall filesystem. + * + * The shift throttle value is set to the maximum value as determined by + * the global low free space values and per-quota low free space values. + */ + alloc_blocks = MIN(alloc_blocks, qblocks); + shift = MAX(shift, qshift); + + if (shift) + alloc_blocks >>= shift; + /* + * rounddown_pow_of_two() returns an undefined result if we pass in + * alloc_blocks = 0. + */ + if (alloc_blocks) + alloc_blocks = rounddown_pow_of_two(alloc_blocks); + if (alloc_blocks > MAXEXTLEN) + alloc_blocks = MAXEXTLEN; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks && alloc_blocks >= freesp) + alloc_blocks >>= 4; + +check_writeio: + if (alloc_blocks < mp->m_writeio_blocks) + alloc_blocks = mp->m_writeio_blocks; + + trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, + mp->m_writeio_blocks); + + return alloc_blocks; +} + int xfs_iomap_write_delay( xfs_inode_t *ip, xfs_off_t offset, size_t count, - int ioflag, - xfs_bmbt_irec_t *ret_imap, - int *nmaps) + xfs_bmbt_irec_t *ret_imap) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_fileoff_t last_fsb; xfs_off_t aligned_offset; xfs_fileoff_t ioalign; - xfs_fsblock_t firstblock; xfs_extlen_t extsz; - xfs_fsize_t isize; int nimaps; xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; - int prealloc, fsynced = 0; + int prealloc; int error; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Make sure that the dquots are there. This doesn't hold * the ilock across a disk read. */ - error = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED); + error = xfs_qm_dqattach_locked(ip, 0); if (error) return XFS_ERROR(error); - if (XFS_IS_REALTIME_INODE(ip)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - + extsz = xfs_get_extsz_hint(ip); offset_fsb = XFS_B_TO_FSBT(mp, offset); -retry: - isize = ip->i_d.di_size; - if (io->io_new_size > isize) - isize = io->io_new_size; - - error = xfs_iomap_eof_want_preallocate(mp, io, isize, offset, count, - ioflag, imap, XFS_WRITE_IMAPS, &prealloc); + error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, + imap, XFS_WRITE_IMAPS, &prealloc); if (error) return error; +retry: if (prealloc) { + xfs_fsblock_t alloc_blocks; + + alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, + XFS_WRITE_IMAPS); + aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); ioalign = XFS_B_TO_FSBT(mp, aligned_offset); - last_fsb = ioalign + mp->m_writeio_blocks; + last_fsb = ioalign + alloc_blocks; } else { last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); } if (prealloc || extsz) { - error = xfs_iomap_eof_align_last_fsb(mp, io, isize, extsz, - &last_fsb); + error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) return error; } + /* + * Make sure preallocation does not create extents beyond the range we + * actually support in this filesystem. + */ + if (last_fsb > XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)) + last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + + ASSERT(last_fsb > offset_fsb); + nimaps = XFS_WRITE_IMAPS; - firstblock = NULLFSBLOCK; - error = XFS_BMAPI(mp, NULL, io, offset_fsb, - (xfs_filblks_t)(last_fsb - offset_fsb), - XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | - XFS_BMAPI_ENTIRE, &firstblock, 1, imap, - &nimaps, NULL, NULL); - if (error && (error != ENOSPC)) + error = xfs_bmapi_delay(ip, offset_fsb, last_fsb - offset_fsb, + imap, &nimaps, XFS_BMAPI_ENTIRE); + switch (error) { + case 0: + case ENOSPC: + case EDQUOT: + break; + default: return XFS_ERROR(error); + } /* - * If bmapi returned us nothing, and if we didn't get back EDQUOT, - * then we must have run out of space - flush delalloc, and retry.. + * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry + * without EOF preallocation. */ if (nimaps == 0) { - xfs_iomap_enter_trace(XFS_IOMAP_WRITE_NOSPACE, - io, offset, count); - if (xfs_flush_space(ip, &fsynced, &ioflag)) - return XFS_ERROR(ENOSPC); - - error = 0; - goto retry; + trace_xfs_delalloc_enospc(ip, offset, count); + if (prealloc) { + prealloc = 0; + error = 0; + goto retry; + } + return XFS_ERROR(error ? error : ENOSPC); } - if (!(io->io_flags & XFS_IOCORE_RT) && !ret_imap->br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> inode: %lld " - "start_block : %llx start_off : %llx blkcnt : %llx " - "extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)ret_imap->br_startblock, - (unsigned long long)ret_imap->br_startoff, - (unsigned long long)ret_imap->br_blockcount, - ret_imap->br_state); - } + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap[0]); - *ret_imap = imap[0]; - *nmaps = 1; + /* + * Tag the inode as speculatively preallocated so we can reclaim this + * space on demand, if necessary. + */ + if (prealloc) + xfs_inode_set_eofblocks_tag(ip); + *ret_imap = imap[0]; return 0; } @@ -739,39 +637,37 @@ retry: * the originating callers request. * * Called without a lock on the inode. + * + * We no longer bother to look at the incoming map - all we have to + * guarantee is that whatever we allocate fills the required range. */ int xfs_iomap_write_allocate( xfs_inode_t *ip, xfs_off_t offset, - size_t count, - xfs_bmbt_irec_t *map, - int *retmap) + xfs_bmbt_irec_t *imap) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb, last_block; xfs_fileoff_t end_fsb, map_start_fsb; xfs_fsblock_t first_block; xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; - xfs_bmbt_irec_t imap[XFS_STRAT_WRITE_IMAPS]; xfs_trans_t *tp; - int i, nimaps, committed; + int nimaps, committed; int error = 0; int nres; - *retmap = 0; - /* * Make sure that the dquots are there. */ - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) + error = xfs_qm_dqattach(ip, 0); + if (error) return XFS_ERROR(error); offset_fsb = XFS_B_TO_FSBT(mp, offset); - count_fsb = map->br_blockcount; - map_start_fsb = map->br_startoff; + count_fsb = imap->br_blockcount; + map_start_fsb = imap->br_startoff; XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); @@ -788,38 +684,57 @@ xfs_iomap_write_allocate( nimaps = 0; while (nimaps == 0) { tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); + tp->t_flags |= XFS_TRANS_RESERVE; nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_reserve(tp, nres, - XFS_WRITE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - if (error == ENOSPC) { - error = xfs_trans_reserve(tp, 0, - XFS_WRITE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - } + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + nres, 0); if (error) { xfs_trans_cancel(tp, 0); return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip, 0); - XFS_BMAP_INIT(&free_list, &first_block); + xfs_bmap_init(&free_list, &first_block); - nimaps = XFS_STRAT_WRITE_IMAPS; /* - * Ensure we don't go beyond eof - it is possible - * the extents changed since we did the read call, - * we dropped the ilock in the interim. + * it is possible that the extents have changed since + * we did the read call as we dropped the ilock for a + * while. We have to be careful about truncates or hole + * punchs here - we are not allowed to allocate + * non-delalloc blocks here. + * + * The only protection against truncation is the pages + * for the range we are being asked to convert are + * locked and hence a truncate will block on them + * first. + * + * As a result, if we go beyond the range we really + * need and hit an delalloc extent boundary followed by + * a hole while we have excess blocks in the map, we + * will fill the hole incorrectly and overrun the + * transaction reservation. + * + * Using a single map prevents this as we are forced to + * check each map we look for overlap with the desired + * range and abort as soon as we find it. Also, given + * that we only return a single map, having one beyond + * what we can return is probably a bit silly. + * + * We also need to check that we don't go beyond EOF; + * this is a truncate optimisation as a truncate sets + * the new file size before block on the pages we + * currently have locked under writeback. Because they + * are about to be tossed, we don't need to write them + * back.... */ + nimaps = 1; + end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + error = xfs_bmap_last_offset(ip, &last_block, + XFS_DATA_FORK); + if (error) + goto trans_cancel; - end_fsb = XFS_B_TO_FSB(mp, ip->i_d.di_size); - xfs_bmap_last_offset(NULL, ip, &last_block, - XFS_DATA_FORK); last_block = XFS_FILEOFF_MAX(last_block, end_fsb); if ((map_start_fsb + count_fsb) > last_block) { count_fsb = last_block - map_start_fsb; @@ -829,20 +744,22 @@ xfs_iomap_write_allocate( } } - /* Go get the actual blocks */ - error = XFS_BMAPI(mp, tp, io, map_start_fsb, count_fsb, - XFS_BMAPI_WRITE, &first_block, 1, - imap, &nimaps, &free_list, NULL); + /* + * From this point onwards we overwrite the imap + * pointer that the caller gave to us. + */ + error = xfs_bmapi_write(tp, ip, map_start_fsb, + count_fsb, 0, + &first_block, 1, + imap, &nimaps, &free_list); if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, - first_block, &committed); + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto trans_cancel; - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) goto error0; @@ -853,41 +770,22 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - - for (i = 0; i < nimaps; i++) { - if (!(io->io_flags & XFS_IOCORE_RT) && - !imap[i].br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: " - "fs <%s> inode: %lld " - "start_block : %llx start_off : %llx " - "blkcnt : %llx extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long) - imap[i].br_startblock, - (unsigned long long) - imap[i].br_startoff, - (unsigned long long) - imap[i].br_blockcount, - imap[i].br_state); - } - if ((offset_fsb >= imap[i].br_startoff) && - (offset_fsb < (imap[i].br_startoff + - imap[i].br_blockcount))) { - *map = imap[i]; - *retmap = 1; - XFS_STATS_INC(xs_xstrat_quick); - return 0; - } - count_fsb -= imap[i].br_blockcount; + if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, imap); + + if ((offset_fsb >= imap->br_startoff) && + (offset_fsb < (imap->br_startoff + + imap->br_blockcount))) { + XFS_STATS_INC(xs_xstrat_quick); + return 0; } - /* So far we have not mapped the requested part of the + /* + * So far we have not mapped the requested part of the * file, just surrounding data, try again. */ - nimaps--; - map_start_fsb = imap[nimaps].br_startoff + - imap[nimaps].br_blockcount; + count_fsb -= imap->br_blockcount; + map_start_fsb = imap->br_startoff + imap->br_blockcount; } trans_cancel: @@ -905,7 +803,6 @@ xfs_iomap_write_unwritten( size_t count) { xfs_mount_t *mp = ip->i_mount; - xfs_iocore_t *io = &ip->i_iocore; xfs_fileoff_t offset_fsb; xfs_filblks_t count_fsb; xfs_filblks_t numblks_fsb; @@ -914,17 +811,27 @@ xfs_iomap_write_unwritten( xfs_trans_t *tp; xfs_bmbt_irec_t imap; xfs_bmap_free_t free_list; + xfs_fsize_t i_size; uint resblks; int committed; int error; - xfs_iomap_enter_trace(XFS_IOMAP_UNWRITTEN, - &ip->i_iocore, offset, count); + trace_xfs_unwritten_convert(ip, offset, count); offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); + /* + * Reserve enough blocks in this transaction for two complete extent + * btree splits. We may be converting the middle part of an unwritten + * extent and in this case we will insert two new extents in the btree + * each of which could cause a full split. + * + * This reservation amount will be used in the first call to + * xfs_bmbt_split() to select an AG with enough space to satisfy the + * rest of the operation. + */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; do { @@ -932,54 +839,63 @@ xfs_iomap_write_unwritten( * set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. + * + * Note that we open code the transaction allocation here + * to pass KM_NOFS--we can't risk to recursing back into + * the filesystem here as we might be asked to write out + * the same inode that we complete here and might deadlock + * on the iolock. */ - - tp = xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE); - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS); + tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + resblks, 0); if (error) { xfs_trans_cancel(tp, 0); - goto error0; + return XFS_ERROR(error); } xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); + xfs_trans_ijoin(tp, ip, 0); /* * Modify the unwritten extent state of the buffer. */ - XFS_BMAP_INIT(&free_list, &firstfsb); + xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; - error = XFS_BMAPI(mp, tp, io, offset_fsb, count_fsb, - XFS_BMAPI_WRITE|XFS_BMAPI_CONVERT, &firstfsb, - 1, &imap, &nimaps, &free_list, NULL); + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_CONVERT, &firstfsb, + 1, &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; - error = xfs_bmap_finish(&(tp), &(free_list), - firstfsb, &committed); + /* + * Log the updated inode size as we go. We have to be careful + * to only log it up to the actual write offset if it is + * halfway into a block. + */ + i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); + if (i_size > offset + count) + i_size = offset + count; + + i_size = xfs_new_eof(ip, i_size); + if (i_size) { + ip->i_d.di_size = i_size; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) - goto error0; - - if ( !(io->io_flags & XFS_IOCORE_RT) && !imap.br_startblock) { - cmn_err(CE_PANIC,"Access to block zero: fs <%s> " - "inode: %lld start_block : %llx start_off : " - "%llx blkcnt : %llx extent-state : %x \n", - (ip->i_mount)->m_fsname, - (long long)ip->i_ino, - (unsigned long long)imap.br_startblock, - (unsigned long long)imap.br_startoff, - (unsigned long long)imap.br_blockcount, - imap.br_state); - } + return XFS_ERROR(error); + + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) + return xfs_alert_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { /* @@ -999,6 +915,5 @@ error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); xfs_iunlock(ip, XFS_ILOCK_EXCL); -error0: return XFS_ERROR(error); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 3ce204a524b..411fbb8919e 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,73 +18,15 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -#define IOMAP_DADDR_NULL ((xfs_daddr_t) (-1LL)) - - -typedef enum { /* iomap_flags values */ - IOMAP_EOF = 0x01, /* mapping contains EOF */ - IOMAP_HOLE = 0x02, /* mapping covers a hole */ - IOMAP_DELAY = 0x04, /* mapping covers delalloc region */ - IOMAP_REALTIME = 0x10, /* mapping on the realtime device */ - IOMAP_UNWRITTEN = 0x20, /* mapping covers allocated */ - /* but uninitialized file data */ - IOMAP_NEW = 0x40 /* just allocate */ -} iomap_flags_t; - -typedef enum { - /* base extent manipulation calls */ - BMAPI_READ = (1 << 0), /* read extents */ - BMAPI_WRITE = (1 << 1), /* create extents */ - BMAPI_ALLOCATE = (1 << 2), /* delayed allocate to real extents */ - BMAPI_UNWRITTEN = (1 << 3), /* unwritten extents to real extents */ - /* modifiers */ - BMAPI_IGNSTATE = (1 << 4), /* ignore unwritten state on read */ - BMAPI_DIRECT = (1 << 5), /* direct instead of buffered write */ - BMAPI_MMAP = (1 << 6), /* allocate for mmap write */ - BMAPI_SYNC = (1 << 7), /* sync write to flush delalloc space */ - BMAPI_TRYLOCK = (1 << 8), /* non-blocking request */ - BMAPI_DEVICE = (1 << 9), /* we only want to know the device */ -} bmapi_flags_t; - - -/* - * xfs_iomap_t: File system I/O map - * - * The iomap_bn field is expressed in 512-byte blocks, and is where the - * mapping starts on disk. - * - * The iomap_offset, iomap_bsize and iomap_delta fields are in bytes. - * iomap_offset is the offset of the mapping in the file itself. - * iomap_bsize is the size of the mapping, iomap_delta is the - * desired data's offset into the mapping, given the offset supplied - * to the file I/O map routine. - * - * When a request is made to read beyond the logical end of the object, - * iomap_size may be set to 0, but iomap_offset and iomap_length should be set - * to the actual amount of underlying storage that has been allocated, if any. - */ - -typedef struct xfs_iomap { - xfs_daddr_t iomap_bn; /* first 512b blk of mapping */ - xfs_buftarg_t *iomap_target; - xfs_off_t iomap_offset; /* offset of mapping, bytes */ - xfs_off_t iomap_bsize; /* size of mapping, bytes */ - xfs_off_t iomap_delta; /* offset into mapping, bytes */ - iomap_flags_t iomap_flags; -} xfs_iomap_t; - -struct xfs_iocore; struct xfs_inode; struct xfs_bmbt_irec; -extern int xfs_iomap(struct xfs_iocore *, xfs_off_t, ssize_t, int, - struct xfs_iomap *, int *); -extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, - int, struct xfs_bmbt_irec *, int *, int); -extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); -extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); +int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *, int); +int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, + struct xfs_bmbt_irec *); +int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, + struct xfs_bmbt_irec *); +int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c new file mode 100644 index 00000000000..205613a0606 --- /dev/null +++ b/fs/xfs/xfs_iops.c @@ -0,0 +1,1299 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_acl.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_attr.h" +#include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_symlink.h" +#include "xfs_da_btree.h" +#include "xfs_dir2_priv.h" +#include "xfs_dinode.h" +#include "xfs_trans_space.h" + +#include <linux/capability.h> +#include <linux/xattr.h> +#include <linux/namei.h> +#include <linux/posix_acl.h> +#include <linux/security.h> +#include <linux/fiemap.h> +#include <linux/slab.h> + +/* + * Directories have different lock order w.r.t. mmap_sem compared to regular + * files. This is due to readdir potentially triggering page faults on a user + * buffer inside filldir(), and this happens with the ilock on the directory + * held. For regular files, the lock order is the other way around - the + * mmap_sem is taken during the page fault, and then we lock the ilock to do + * block mapping. Hence we need a different class for the directory ilock so + * that lockdep can tell them apart. + */ +static struct lock_class_key xfs_nondir_ilock_class; +static struct lock_class_key xfs_dir_ilock_class; + +static int +xfs_initxattrs( + struct inode *inode, + const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + struct xfs_inode *ip = XFS_I(inode); + int error = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); + if (error < 0) + break; + } + return error; +} + +/* + * Hook in SELinux. This is not quite correct yet, what we really need + * here (as we do for default ACLs) is a mechanism by which creation of + * these attrs can be journalled at inode creation time (along with the + * inode, of course, such that log replay can't cause these to be lost). + */ + +STATIC int +xfs_init_security( + struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); +} + +static void +xfs_dentry_to_name( + struct xfs_name *namep, + struct dentry *dentry, + int mode) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; +} + +STATIC void +xfs_cleanup_inode( + struct inode *dir, + struct inode *inode, + struct dentry *dentry) +{ + struct xfs_name teardown; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * xfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + xfs_dentry_to_name(&teardown, dentry, 0); + + xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); +} + +STATIC int +xfs_generic_create( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev, + bool tmpfile) /* unnamed file */ +{ + struct inode *inode; + struct xfs_inode *ip = NULL; + struct posix_acl *default_acl, *acl; + struct xfs_name name; + int error; + + /* + * Irix uses Missed'em'V split, but doesn't want to see + * the upper 5 bits of (14bit) major. + */ + if (S_ISCHR(mode) || S_ISBLK(mode)) { + if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) + return -EINVAL; + rdev = sysv_encode_dev(rdev); + } else { + rdev = 0; + } + + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + return error; + + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } + if (unlikely(error)) + goto out_free_acl; + + inode = VFS_I(ip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + +#ifdef CONFIG_XFS_POSIX_ACL + if (default_acl) { + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto out_cleanup_inode; + } + if (acl) { + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto out_cleanup_inode; + } +#endif + + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + + out_free_acl: + if (default_acl) + posix_acl_release(default_acl); + if (acl) + posix_acl_release(acl); + return -error; + + out_cleanup_inode: + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + goto out_free_acl; +} + +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + +STATIC int +xfs_vn_create( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool flags) +{ + return xfs_vn_mknod(dir, dentry, mode, 0); +} + +STATIC int +xfs_vn_mkdir( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); +} + +STATIC struct dentry * +xfs_vn_lookup( + struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct xfs_inode *cip; + struct xfs_name name; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&name, dentry, 0); + error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + d_add(dentry, NULL); + return NULL; + } + + return d_splice_alias(VFS_I(cip), dentry); +} + +STATIC struct dentry * +xfs_vn_ci_lookup( + struct inode *dir, + struct dentry *dentry, + unsigned int flags) +{ + struct xfs_inode *ip; + struct xfs_name xname; + struct xfs_name ci_name; + struct qstr dname; + int error; + + if (dentry->d_name.len >= MAXNAMELEN) + return ERR_PTR(-ENAMETOOLONG); + + xfs_dentry_to_name(&xname, dentry, 0); + error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); + if (unlikely(error)) { + if (unlikely(error != ENOENT)) + return ERR_PTR(-error); + /* + * call d_add(dentry, NULL) here when d_drop_negative_children + * is called in xfs_vn_mknod (ie. allow negative dentries + * with CI filesystems). + */ + return NULL; + } + + /* if exact match, just splice and exit */ + if (!ci_name.name) + return d_splice_alias(VFS_I(ip), dentry); + + /* else case-insensitive match... */ + dname.name = ci_name.name; + dname.len = ci_name.len; + dentry = d_add_ci(dentry, VFS_I(ip), &dname); + kmem_free(ci_name.name); + return dentry; +} + +STATIC int +xfs_vn_link( + struct dentry *old_dentry, + struct inode *dir, + struct dentry *dentry) +{ + struct inode *inode = old_dentry->d_inode; + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry, inode->i_mode); + + error = xfs_link(XFS_I(dir), XFS_I(inode), &name); + if (unlikely(error)) + return -error; + + ihold(inode); + d_instantiate(dentry, inode); + return 0; +} + +STATIC int +xfs_vn_unlink( + struct inode *dir, + struct dentry *dentry) +{ + struct xfs_name name; + int error; + + xfs_dentry_to_name(&name, dentry, 0); + + error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + if (error) + return error; + + /* + * With unlink, the VFS makes the dentry "negative": no inode, + * but still hashed. This is incompatible with case-insensitive + * mode, so invalidate (unhash) the dentry in CI-mode. + */ + if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + d_invalidate(dentry); + return 0; +} + +STATIC int +xfs_vn_symlink( + struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct inode *inode; + struct xfs_inode *cip = NULL; + struct xfs_name name; + int error; + umode_t mode; + + mode = S_IFLNK | + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); + xfs_dentry_to_name(&name, dentry, mode); + + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + if (unlikely(error)) + goto out; + + inode = VFS_I(cip); + + error = xfs_init_security(inode, dir, &dentry->d_name); + if (unlikely(error)) + goto out_cleanup_inode; + + d_instantiate(dentry, inode); + return 0; + + out_cleanup_inode: + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); + out: + return -error; +} + +STATIC int +xfs_vn_rename( + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry) +{ + struct inode *new_inode = ndentry->d_inode; + struct xfs_name oname; + struct xfs_name nname; + + xfs_dentry_to_name(&oname, odentry, 0); + xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); + + return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + XFS_I(ndir), &nname, new_inode ? + XFS_I(new_inode) : NULL); +} + +/* + * careful here - this function can get called recursively, so + * we need to be very careful about how much stack we use. + * uio is kmalloced for this reason... + */ +STATIC void * +xfs_vn_follow_link( + struct dentry *dentry, + struct nameidata *nd) +{ + char *link; + int error = -ENOMEM; + + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); + if (!link) + goto out_err; + + error = -xfs_readlink(XFS_I(dentry->d_inode), link); + if (unlikely(error)) + goto out_kfree; + + nd_set_link(nd, link); + return NULL; + + out_kfree: + kfree(link); + out_err: + nd_set_link(nd, ERR_PTR(error)); + return NULL; +} + +STATIC int +xfs_vn_getattr( + struct vfsmount *mnt, + struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + trace_xfs_getattr(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return -XFS_ERROR(EIO); + + stat->size = XFS_ISIZE(ip); + stat->dev = inode->i_sb->s_dev; + stat->mode = ip->i_d.di_mode; + stat->nlink = ip->i_d.di_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->ino = ip->i_ino; + stat->atime = inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blocks = + XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); + + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + stat->blksize = BLKDEV_IOSIZE; + stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * If the file blocks are being allocated from a + * realtime volume, then return the inode's realtime + * extent size or the realtime volume's extent size. + */ + stat->blksize = + xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; + } else + stat->blksize = xfs_preferred_iosize(mp); + stat->rdev = 0; + break; + } + + return 0; +} + +static void +xfs_setattr_mode( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + +static void +xfs_setattr_time( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (iattr->ia_valid & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_CTIME) { + inode->i_ctime = iattr->ia_ctime; + ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; + ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + } +} + +int +xfs_setattr_nonsize( + struct xfs_inode *ip, + struct iattr *iattr, + int flags) +{ + xfs_mount_t *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + int mask = iattr->ia_valid; + xfs_trans_t *tp; + int error; + kuid_t uid = GLOBAL_ROOT_UID, iuid = GLOBAL_ROOT_UID; + kgid_t gid = GLOBAL_ROOT_GID, igid = GLOBAL_ROOT_GID; + struct xfs_dquot *udqp = NULL, *gdqp = NULL; + struct xfs_dquot *olddquot1 = NULL, *olddquot2 = NULL; + + trace_xfs_setattr(ip); + + /* If acls are being inherited, we already have this checked */ + if (!(flags & XFS_ATTR_NOACL)) { + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = -inode_change_ok(inode, iattr); + if (error) + return XFS_ERROR(error); + } + + ASSERT((mask & ATTR_SIZE) == 0); + + /* + * If disk quotas is on, we make sure that the dquots do exist on disk, + * before we start any other transactions. Trying to do this later + * is messy. We don't care to take a readlock to look at the ids + * in inode here, because we can't hold it across the trans_reserve. + * If the IDs do change before we take the ilock, we're covered + * because the i_*dquot fields will get updated anyway. + */ + if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) { + uint qflags = 0; + + if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) { + uid = iattr->ia_uid; + qflags |= XFS_QMOPT_UQUOTA; + } else { + uid = inode->i_uid; + } + if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) { + gid = iattr->ia_gid; + qflags |= XFS_QMOPT_GQUOTA; + } else { + gid = inode->i_gid; + } + + /* + * We take a reference when we initialize udqp and gdqp, + * so it is important that we never blindly double trip on + * the same variable. See xfs_create() for an example. + */ + ASSERT(udqp == NULL); + ASSERT(gdqp == NULL); + error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid), + xfs_kgid_to_gid(gid), + xfs_get_projid(ip), + qflags, &udqp, &gdqp, NULL); + if (error) + return error; + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); + if (error) + goto out_dqrele; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * These IDs could have changed since we last looked at them. + * But, we're assured that if the ownership did change + * while we didn't have the inode locked, inode's dquot(s) + * would have changed also. + */ + iuid = inode->i_uid; + igid = inode->i_gid; + gid = (mask & ATTR_GID) ? iattr->ia_gid : igid; + uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid; + + /* + * Do a quota reservation only if uid/gid is actually + * going to change. + */ + if (XFS_IS_QUOTA_RUNNING(mp) && + ((XFS_IS_UQUOTA_ON(mp) && !uid_eq(iuid, uid)) || + (XFS_IS_GQUOTA_ON(mp) && !gid_eq(igid, gid)))) { + ASSERT(tp); + error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, + NULL, capable(CAP_FOWNER) ? + XFS_QMOPT_FORCE_RES : 0); + if (error) /* out of quota */ + goto out_trans_cancel; + } + } + + xfs_trans_ijoin(tp, ip, 0); + + /* + * Change file ownership. Must be the owner or privileged. + */ + if (mask & (ATTR_UID|ATTR_GID)) { + /* + * CAP_FSETID overrides the following restrictions: + * + * The set-user-ID and set-group-ID bits of a file will be + * cleared upon successful return from chown() + */ + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && + !capable(CAP_FSETID)) + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); + + /* + * Change the ownerships and register quota modifications + * in the transaction. + */ + if (!uid_eq(iuid, uid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(mask & ATTR_UID); + ASSERT(udqp); + olddquot1 = xfs_qm_vop_chown(tp, ip, + &ip->i_udquot, udqp); + } + ip->i_d.di_uid = xfs_kuid_to_uid(uid); + inode->i_uid = uid; + } + if (!gid_eq(igid, gid)) { + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + !XFS_IS_PQUOTA_ON(mp)); + ASSERT(mask & ATTR_GID); + ASSERT(gdqp); + olddquot2 = xfs_qm_vop_chown(tp, ip, + &ip->i_gdquot, gdqp); + } + ip->i_d.di_gid = xfs_kgid_to_gid(gid); + inode->i_gid = gid; + } + } + + if (mask & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* + * Release any dquot(s) the inode had kept before chown. + */ + xfs_qm_dqrele(olddquot1); + xfs_qm_dqrele(olddquot2); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + + if (error) + return XFS_ERROR(error); + + /* + * XXX(hch): Updating the ACL entries is not atomic vs the i_mode + * update. We could avoid this with linked transactions + * and passing down the transaction pointer all the way + * to attr_set. No previous user of the generic + * Posix ACL code seems to care about this issue either. + */ + if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) { + error = -posix_acl_chmod(inode, inode->i_mode); + if (error) + return XFS_ERROR(error); + } + + return 0; + +out_trans_cancel: + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +out_dqrele: + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + return error; +} + +/* + * Truncate file. Must have write permission and not be a directory. + */ +int +xfs_setattr_size( + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct xfs_mount *mp = ip->i_mount; + struct inode *inode = VFS_I(ip); + xfs_off_t oldsize, newsize; + struct xfs_trans *tp; + int error; + uint lock_flags = 0; + uint commit_flags = 0; + + trace_xfs_setattr(ip); + + if (mp->m_flags & XFS_MOUNT_RDONLY) + return XFS_ERROR(EROFS); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + error = -inode_change_ok(inode, iattr); + if (error) + return XFS_ERROR(error); + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(S_ISREG(ip->i_d.di_mode)); + ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + + oldsize = inode->i_size; + newsize = iattr->ia_size; + + /* + * Short circuit the truncate case for zero length files. + */ + if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) { + if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME))) + return 0; + + /* + * Use the regular setattr path to update the timestamps. + */ + iattr->ia_valid &= ~ATTR_SIZE; + return xfs_setattr_nonsize(ip, iattr, 0); + } + + /* + * Make sure that the dquots are attached to the inode. + */ + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + + /* + * Now we can make the changes. Before we join the inode to the + * transaction, take care of the part of the truncation that must be + * done without the inode lock. This needs to be done before joining + * the inode to the transaction, because the inode cannot be unlocked + * once it is a part of the transaction. + */ + if (newsize > oldsize) { + /* + * Do the first part of growing a file: zero any data in the + * last block that is beyond the old EOF. We need to do this + * before the inode is joined to the transaction to modify + * i_size. + */ + error = xfs_zero_eof(ip, newsize, oldsize); + if (error) + return error; + } + + /* + * We are going to log the inode size change in this transaction so + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files + * problem. + * + * Only flush from the on disk size to the smaller of the in memory + * file size or the new size as that's the range we really care about + * here and prevents waiting for other data not within the range we + * care about here. + */ + if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { + error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); + if (error) + return error; + } + + /* + * Wait for all direct I/O to complete. + */ + inode_dio_wait(inode); + + /* + * Do all the page cache truncate work outside the transaction context + * as the "lock" order is page lock->log space reservation. i.e. + * locking pages inside the transaction can ABBA deadlock with + * writeback. We have to do the VFS inode size update before we truncate + * the pagecache, however, to avoid racing with page faults beyond the + * new EOF they are not serialised against truncate operations except by + * page locks and size updates. + * + * Hence we are in a situation where a truncate can fail with ENOMEM + * from xfs_trans_reserve(), but having already truncated the in-memory + * version of the file (i.e. made user visible changes). There's not + * much we can do about this, except to hope that the caller sees ENOMEM + * and retries the truncate operation. + */ + error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (error) + return error; + truncate_setsize(inode, newsize); + + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto out_trans_cancel; + + commit_flags = XFS_TRANS_RELEASE_LOG_RES; + lock_flags |= XFS_ILOCK_EXCL; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Only change the c/mtime if we are changing the size or we are + * explicitly asked to change it. This handles the semantic difference + * between truncate() and ftruncate() as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && + !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; + } + + /* + * The first thing we do is set the size to new_size permanently on + * disk. This way we don't have to worry about anyone ever being able + * to look at the data being freed even in the face of a crash. + * What we're getting around here is the case where we free a block, it + * is allocated to another file, it is written to, and then we crash. + * If the new data gets written to the file but the log buffers + * containing the free and reallocation don't, then we'd end up with + * garbage in the blocks being freed. As long as we make the new size + * permanent before actually freeing any blocks it doesn't matter if + * they get written to. + */ + ip->i_d.di_size = newsize; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (newsize <= oldsize) { + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); + if (error) + goto out_trans_abort; + + /* + * Truncated "down", so we're removing references to old data + * here - if we delay flushing for a long time, we expose + * ourselves unduly to the notorious NULL files problem. So, + * we mark this inode and flush it when the file is closed, + * and do not wait the usual (long) time for writeout. + */ + xfs_iflags_set(ip, XFS_ITRUNCATED); + + /* A truncate down always removes post-EOF blocks. */ + xfs_inode_clear_eofblocks_tag(ip); + } + + if (iattr->ia_valid & ATTR_MODE) + xfs_setattr_mode(ip, iattr); + if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_setattr_time(ip, iattr); + + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + XFS_STATS_INC(xs_ig_attrchg); + + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); + + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); +out_unlock: + if (lock_flags) + xfs_iunlock(ip, lock_flags); + return error; + +out_trans_abort: + commit_flags |= XFS_TRANS_ABORT; +out_trans_cancel: + xfs_trans_cancel(tp, commit_flags); + goto out_unlock; +} + +STATIC int +xfs_vn_setattr( + struct dentry *dentry, + struct iattr *iattr) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + int error; + + if (iattr->ia_valid & ATTR_SIZE) { + xfs_ilock(ip, XFS_IOLOCK_EXCL); + error = xfs_setattr_size(ip, iattr); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + } else { + error = xfs_setattr_nonsize(ip, iattr, 0); + } + + return -error; +} + +STATIC int +xfs_vn_update_time( + struct inode *inode, + struct timespec *now, + int flags) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + trace_xfs_update_time(ip); + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return -error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (flags & S_CTIME) { + inode->i_ctime = *now; + ip->i_d.di_ctime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_ctime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_MTIME) { + inode->i_mtime = *now; + ip->i_d.di_mtime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_mtime.t_nsec = (__int32_t)now->tv_nsec; + } + if (flags & S_ATIME) { + inode->i_atime = *now; + ip->i_d.di_atime.t_sec = (__int32_t)now->tv_sec; + ip->i_d.di_atime.t_nsec = (__int32_t)now->tv_nsec; + } + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); + return -xfs_trans_commit(tp, 0); +} + +#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +/* + * Call fiemap helper to fill in user data. + * Returns positive errors to xfs_getbmap. + */ +STATIC int +xfs_fiemap_format( + void **arg, + struct getbmapx *bmv, + int *full) +{ + int error; + struct fiemap_extent_info *fieinfo = *arg; + u32 fiemap_flags = 0; + u64 logical, physical, length; + + /* Do nothing for a hole */ + if (bmv->bmv_block == -1LL) + return 0; + + logical = BBTOB(bmv->bmv_offset); + physical = BBTOB(bmv->bmv_block); + length = BBTOB(bmv->bmv_length); + + if (bmv->bmv_oflags & BMV_OF_PREALLOC) + fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN; + else if (bmv->bmv_oflags & BMV_OF_DELALLOC) { + fiemap_flags |= (FIEMAP_EXTENT_DELALLOC | + FIEMAP_EXTENT_UNKNOWN); + physical = 0; /* no block yet */ + } + if (bmv->bmv_oflags & BMV_OF_LAST) + fiemap_flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, fiemap_flags); + if (error > 0) { + error = 0; + *full = 1; /* user array now full */ + } + + return -error; +} + +STATIC int +xfs_vn_fiemap( + struct inode *inode, + struct fiemap_extent_info *fieinfo, + u64 start, + u64 length) +{ + xfs_inode_t *ip = XFS_I(inode); + struct getbmapx bm; + int error; + + error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); + if (error) + return error; + + /* Set up bmap header for xfs internal routine */ + bm.bmv_offset = BTOBB(start); + /* Special case for whole file */ + if (length == FIEMAP_MAX_OFFSET) + bm.bmv_length = -1LL; + else + bm.bmv_length = BTOBB(length); + + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM : + fieinfo->fi_extents_max + 1; + bm.bmv_count = min_t(__s32, bm.bmv_count, + (PAGE_SIZE * 16 / sizeof(struct getbmapx))); + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) + bm.bmv_iflags |= BMV_IF_ATTRFORK; + if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) + bm.bmv_iflags |= BMV_IF_DELALLOC; + + error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo); + if (error) + return -error; + + return 0; +} + +STATIC int +xfs_vn_tmpfile( + struct inode *dir, + struct dentry *dentry, + umode_t mode) +{ + return xfs_generic_create(dir, dentry, mode, 0, true); +} + +static const struct inode_operations xfs_inode_operations = { + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .fiemap = xfs_vn_fiemap, + .update_time = xfs_vn_update_time, +}; + +static const struct inode_operations xfs_dir_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, +}; + +static const struct inode_operations xfs_dir_ci_inode_operations = { + .create = xfs_vn_create, + .lookup = xfs_vn_ci_lookup, + .link = xfs_vn_link, + .unlink = xfs_vn_unlink, + .symlink = xfs_vn_symlink, + .mkdir = xfs_vn_mkdir, + /* + * Yes, XFS uses the same method for rmdir and unlink. + * + * There are some subtile differences deeper in the code, + * but we use S_ISDIR to check for those. + */ + .rmdir = xfs_vn_unlink, + .mknod = xfs_vn_mknod, + .rename = xfs_vn_rename, + .get_acl = xfs_get_acl, + .set_acl = xfs_set_acl, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, + .tmpfile = xfs_vn_tmpfile, +}; + +static const struct inode_operations xfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = xfs_vn_follow_link, + .put_link = kfree_put_link, + .getattr = xfs_vn_getattr, + .setattr = xfs_vn_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, + .listxattr = xfs_vn_listxattr, + .update_time = xfs_vn_update_time, +}; + +STATIC void +xfs_diflags_to_iflags( + struct inode *inode, + struct xfs_inode *ip) +{ + if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + inode->i_flags |= S_IMMUTABLE; + else + inode->i_flags &= ~S_IMMUTABLE; + if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; +} + +/* + * Initialize the Linux inode, set up the operation vectors and + * unlock the inode. + * + * When reading existing inodes from disk this is called directly + * from xfs_iget, when creating a new inode it is called from + * xfs_ialloc after setting up the inode. + * + * We are always called with an uninitialised linux inode here. + * We need to initialise the necessary fields and take a reference + * on it. + */ +void +xfs_setup_inode( + struct xfs_inode *ip) +{ + struct inode *inode = &ip->i_vnode; + gfp_t gfp_mask; + + inode->i_ino = ip->i_ino; + inode->i_state = I_NEW; + + inode_sb_list_add(inode); + /* make the inode look hashed for the writeback code */ + hlist_add_fake(&inode->i_hash); + + inode->i_mode = ip->i_d.di_mode; + set_nlink(inode, ip->i_d.di_nlink); + inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid); + inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid); + + switch (inode->i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = + MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff, + sysv_minor(ip->i_df.if_u2.if_rdev)); + break; + default: + inode->i_rdev = 0; + break; + } + + inode->i_generation = ip->i_d.di_gen; + i_size_write(inode, ip->i_d.di_size); + inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec; + inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec; + inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec; + inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; + inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec; + inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; + xfs_diflags_to_iflags(inode, ip); + + ip->d_ops = ip->i_mount->m_nondir_inode_ops; + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &xfs_inode_operations; + inode->i_fop = &xfs_file_operations; + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + case S_IFDIR: + lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class); + if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + inode->i_op = &xfs_dir_ci_inode_operations; + else + inode->i_op = &xfs_dir_inode_operations; + inode->i_fop = &xfs_dir_file_operations; + ip->d_ops = ip->i_mount->m_dir_inode_ops; + break; + case S_IFLNK: + inode->i_op = &xfs_symlink_inode_operations; + if (!(ip->i_df.if_flags & XFS_IFINLINE)) + inode->i_mapping->a_ops = &xfs_address_space_operations; + break; + default: + inode->i_op = &xfs_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + } + + /* + * Ensure all page cache allocations are done from GFP_NOFS context to + * prevent direct reclaim recursion back into the filesystem and blowing + * stacks or deadlocking. + */ + gfp_mask = mapping_gfp_mask(inode->i_mapping); + mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + + /* + * If there is no attribute fork no ACL can exist on this inode, + * and it can't have any file capabilities attached to it either. + */ + if (!XFS_IFORK_Q(ip)) { + inode_has_no_xattr(inode); + cache_no_acl(inode); + } + + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + + unlock_new_inode(inode); +} diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/xfs_iops.h index ad6173da567..1c34e433592 100644 --- a/fs/xfs/linux-2.6/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -18,19 +18,22 @@ #ifndef __XFS_IOPS_H__ #define __XFS_IOPS_H__ -extern struct inode_operations xfs_inode_operations; -extern struct inode_operations xfs_dir_inode_operations; -extern struct inode_operations xfs_symlink_inode_operations; +struct xfs_inode; extern const struct file_operations xfs_file_operations; extern const struct file_operations xfs_dir_file_operations; -extern const struct file_operations xfs_invis_file_operations; -extern int xfs_ioctl(struct bhv_desc *, struct inode *, struct file *, - int, unsigned int, void __user *); +extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -struct xfs_inode; -extern void xfs_ichgtime(struct xfs_inode *, int); -extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int); +extern void xfs_setup_inode(struct xfs_inode *); + +/* + * Internal setattr interfaces. + */ +#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */ + +extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap, + int flags); +extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 46249e4d1fe..cb64f222d60 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -17,69 +17,88 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_btree.h" #include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" #include "xfs_itable.h" #include "xfs_error.h" -#include "xfs_btree.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" STATIC int -xfs_bulkstat_one_iget( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - xfs_daddr_t bno, /* starting bno of inode cluster */ - xfs_bstat_t *buf, /* return buffer */ - int *stat) /* BULKSTAT_RV_... */ +xfs_internal_inum( + xfs_mount_t *mp, + xfs_ino_t ino) { - xfs_dinode_core_t *dic; /* dinode core info pointer */ - xfs_inode_t *ip; /* incore inode pointer */ - bhv_vnode_t *vp; - int error; + return (ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || + (xfs_sb_version_hasquota(&mp->m_sb) && + xfs_is_quota_inode(&mp->m_sb, ino))); +} + +/* + * Return stat information for one inode. + * Return 0 if ok, else errno. + */ +int +xfs_bulkstat_one_int( + struct xfs_mount *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode to get data for */ + void __user *buffer, /* buffer to place output in */ + int ubsize, /* size of buffer */ + bulkstat_one_fmt_pf formatter, /* formatter, copy to user */ + int *ubused, /* bytes used by me */ + int *stat) /* BULKSTAT_RV_... */ +{ + struct xfs_icdinode *dic; /* dinode core info pointer */ + struct xfs_inode *ip; /* incore inode pointer */ + struct xfs_bstat *buf; /* return buffer */ + int error = 0; /* error value */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, bno); + *stat = BULKSTAT_RV_NOTHING; + + if (!buffer || xfs_internal_inum(mp, ino)) + return XFS_ERROR(EINVAL); + + buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL); + if (!buf) + return XFS_ERROR(ENOMEM); + + error = xfs_iget(mp, NULL, ino, + (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED), + XFS_ILOCK_SHARED, &ip); if (error) { *stat = BULKSTAT_RV_NOTHING; - return error; + goto out_free; } ASSERT(ip != NULL); - ASSERT(ip->i_blkno != (xfs_daddr_t)0); - if (ip->i_d.di_mode == 0) { - *stat = BULKSTAT_RV_NOTHING; - error = XFS_ERROR(ENOENT); - goto out_iput; - } + ASSERT(ip->i_imap.im_blkno != 0); - vp = XFS_ITOV(ip); dic = &ip->i_d; /* xfs_iget returns the following without needing * further change. */ buf->bs_nlink = dic->di_nlink; - buf->bs_projid = dic->di_projid; + buf->bs_projid_lo = dic->di_projid_lo; + buf->bs_projid_hi = dic->di_projid_hi; buf->bs_ino = ino; buf->bs_mode = dic->di_mode; buf->bs_uid = dic->di_uid; buf->bs_gid = dic->di_gid; buf->bs_size = dic->di_size; - vn_atime_to_bstime(vp, &buf->bs_atime); + buf->bs_atime.tv_sec = dic->di_atime.t_sec; + buf->bs_atime.tv_nsec = dic->di_atime.t_nsec; buf->bs_mtime.tv_sec = dic->di_mtime.t_sec; buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec; buf->bs_ctime.tv_sec = dic->di_ctime.t_sec; @@ -92,6 +111,7 @@ xfs_bulkstat_one_iget( buf->bs_dmevmask = dic->di_dmevmask; buf->bs_dmstate = dic->di_dmstate; buf->bs_aextents = dic->di_anextents; + buf->bs_forkoff = XFS_IFORK_BOFF(ip); switch (dic->di_format) { case XFS_DINODE_FMT_DEV: @@ -112,146 +132,51 @@ xfs_bulkstat_one_iget( buf->bs_blocks = dic->di_nblocks + ip->i_delayed_blks; break; } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + IRELE(ip); + + error = formatter(buffer, ubsize, ubused, buf); - out_iput: - xfs_iput(ip, XFS_ILOCK_SHARED); + if (!error) + *stat = BULKSTAT_RV_DIDONE; + + out_free: + kmem_free(buf); return error; } +/* Return 0 on success or positive error */ STATIC int -xfs_bulkstat_one_dinode( - xfs_mount_t *mp, /* mount point for filesystem */ - xfs_ino_t ino, /* inode number to get data for */ - xfs_dinode_t *dip, /* dinode inode pointer */ - xfs_bstat_t *buf) /* return buffer */ +xfs_bulkstat_one_fmt( + void __user *ubuffer, + int ubsize, + int *ubused, + const xfs_bstat_t *buffer) { - xfs_dinode_core_t *dic; /* dinode core info pointer */ - - dic = &dip->di_core; - - /* - * The inode format changed when we moved the link count and - * made it 32 bits long. If this is an old format inode, - * convert it in memory to look like a new one. If it gets - * flushed to disk we will convert back before flushing or - * logging it. We zero out the new projid field and the old link - * count field. We'll handle clearing the pad field (the remains - * of the old uuid field) when we actually convert the inode to - * the new format. We don't change the version number so that we - * can distinguish this from a real new format inode. - */ - if (INT_GET(dic->di_version, ARCH_CONVERT) == XFS_DINODE_VERSION_1) { - buf->bs_nlink = INT_GET(dic->di_onlink, ARCH_CONVERT); - buf->bs_projid = 0; - } else { - buf->bs_nlink = INT_GET(dic->di_nlink, ARCH_CONVERT); - buf->bs_projid = INT_GET(dic->di_projid, ARCH_CONVERT); - } - - buf->bs_ino = ino; - buf->bs_mode = INT_GET(dic->di_mode, ARCH_CONVERT); - buf->bs_uid = INT_GET(dic->di_uid, ARCH_CONVERT); - buf->bs_gid = INT_GET(dic->di_gid, ARCH_CONVERT); - buf->bs_size = INT_GET(dic->di_size, ARCH_CONVERT); - buf->bs_atime.tv_sec = INT_GET(dic->di_atime.t_sec, ARCH_CONVERT); - buf->bs_atime.tv_nsec = INT_GET(dic->di_atime.t_nsec, ARCH_CONVERT); - buf->bs_mtime.tv_sec = INT_GET(dic->di_mtime.t_sec, ARCH_CONVERT); - buf->bs_mtime.tv_nsec = INT_GET(dic->di_mtime.t_nsec, ARCH_CONVERT); - buf->bs_ctime.tv_sec = INT_GET(dic->di_ctime.t_sec, ARCH_CONVERT); - buf->bs_ctime.tv_nsec = INT_GET(dic->di_ctime.t_nsec, ARCH_CONVERT); - buf->bs_xflags = xfs_dic2xflags(dic); - buf->bs_extsize = INT_GET(dic->di_extsize, ARCH_CONVERT) << mp->m_sb.sb_blocklog; - buf->bs_extents = INT_GET(dic->di_nextents, ARCH_CONVERT); - buf->bs_gen = INT_GET(dic->di_gen, ARCH_CONVERT); - memset(buf->bs_pad, 0, sizeof(buf->bs_pad)); - buf->bs_dmevmask = INT_GET(dic->di_dmevmask, ARCH_CONVERT); - buf->bs_dmstate = INT_GET(dic->di_dmstate, ARCH_CONVERT); - buf->bs_aextents = INT_GET(dic->di_anextents, ARCH_CONVERT); - - switch (INT_GET(dic->di_format, ARCH_CONVERT)) { - case XFS_DINODE_FMT_DEV: - buf->bs_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); - buf->bs_blksize = BLKDEV_IOSIZE; - buf->bs_blocks = 0; - break; - case XFS_DINODE_FMT_LOCAL: - case XFS_DINODE_FMT_UUID: - buf->bs_rdev = 0; - buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = 0; - break; - case XFS_DINODE_FMT_EXTENTS: - case XFS_DINODE_FMT_BTREE: - buf->bs_rdev = 0; - buf->bs_blksize = mp->m_sb.sb_blocksize; - buf->bs_blocks = INT_GET(dic->di_nblocks, ARCH_CONVERT); - break; - } - + if (ubsize < sizeof(*buffer)) + return XFS_ERROR(ENOMEM); + if (copy_to_user(ubuffer, buffer, sizeof(*buffer))) + return XFS_ERROR(EFAULT); + if (ubused) + *ubused = sizeof(*buffer); return 0; } -/* - * Return stat information for one inode. - * Return 0 if ok, else errno. - */ -int /* error status */ +int xfs_bulkstat_one( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t ino, /* inode number to get data for */ void __user *buffer, /* buffer to place output in */ int ubsize, /* size of buffer */ - void *private_data, /* my private data */ - xfs_daddr_t bno, /* starting bno of inode cluster */ int *ubused, /* bytes used by me */ - void *dibuff, /* on-disk inode buffer */ int *stat) /* BULKSTAT_RV_... */ { - xfs_bstat_t *buf; /* return buffer */ - int error = 0; /* error value */ - xfs_dinode_t *dip; /* dinode inode pointer */ - - dip = (xfs_dinode_t *)dibuff; - - if (!buffer || ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (XFS_SB_VERSION_HASQUOTA(&mp->m_sb) && - (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino))) { - *stat = BULKSTAT_RV_NOTHING; - return XFS_ERROR(EINVAL); - } - if (ubsize < sizeof(*buf)) { - *stat = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOMEM); - } - - buf = kmem_alloc(sizeof(*buf), KM_SLEEP); - - if (dip == NULL) { - /* We're not being passed a pointer to a dinode. This happens - * if BULKSTAT_FG_IGET is selected. Do the iget. - */ - error = xfs_bulkstat_one_iget(mp, ino, bno, buf, stat); - if (error) - goto out_free; - } else { - xfs_bulkstat_one_dinode(mp, ino, dip, buf); - } - - if (copy_to_user(buffer, buf, sizeof(*buf))) { - *stat = BULKSTAT_RV_NOTHING; - error = EFAULT; - goto out_free; - } - - *stat = BULKSTAT_RV_DIDONE; - if (ubused) - *ubused = sizeof(*buf); - - out_free: - kmem_free(buf, sizeof(*buf)); - return error; + return xfs_bulkstat_one_int(mp, ino, buffer, ubsize, + xfs_bulkstat_one_fmt, ubused, stat); } +#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) + /* * Return stat information in bulk (by-inode) for the filesystem. */ @@ -261,10 +186,8 @@ xfs_bulkstat( xfs_ino_t *lastinop, /* last inode returned */ int *ubcountp, /* size of buffer/count returned */ bulkstat_one_pf formatter, /* func that'd fill a single buf */ - void *private_data,/* private data for formatter */ size_t statstruct_size, /* sizeof struct filling */ char __user *ubuffer, /* buffer with inode stats */ - int flags, /* defined in xfs_itable.h */ int *done) /* 1 if there are more stats to get */ { xfs_agblock_t agbno=0;/* allocation group block number */ @@ -272,26 +195,22 @@ xfs_bulkstat( xfs_agi_t *agi; /* agi header data */ xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ - xfs_daddr_t bno; /* inode cluster start daddr */ int chunkidx; /* current index into inode chunk */ int clustidx; /* current index into inode cluster */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ int end_of_ag; /* set if we've seen the ag end */ int error; /* error code */ int fmterror;/* bulkstat formatter result */ - __int32_t gcnt; /* current btree rec's count */ - xfs_inofree_t gfree; /* current btree rec's free mask */ - xfs_agino_t gino; /* current btree rec's start inode */ int i; /* loop index */ int icount; /* count of inodes good in irbuf */ + size_t irbsize; /* size of irec buffer in bytes */ xfs_ino_t ino; /* inode number (filesystem) */ - xfs_inobt_rec_t *irbp; /* current irec buffer pointer */ - xfs_inobt_rec_t *irbuf; /* start of irec buffer */ - xfs_inobt_rec_t *irbufend; /* end of good irec buffer entries */ - xfs_ino_t lastino=0; /* last inode number returned */ - int nbcluster; /* # of blocks in a cluster */ - int nicluster; /* # of inodes in a cluster */ - int nimask; /* mask for inode clusters */ + xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */ + xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ + xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */ + xfs_ino_t lastino; /* last inode number returned */ + int blks_per_cluster; /* # of blocks per cluster */ + int inodes_per_cluster;/* # of inodes per cluster */ int nirbuf; /* size of irbuf */ int rval; /* return value error code */ int tmp; /* result value from btree calls */ @@ -300,15 +219,12 @@ xfs_bulkstat( char __user *ubufp; /* pointer into user's buffer */ int ubelem; /* spaces used in user's buffer */ int ubused; /* bytes used by formatter */ - xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */ - xfs_dinode_t *dip; /* ptr into bp for specific inode */ - xfs_inode_t *ip; /* ptr to in-core inode struct */ /* * Get the last inode value, see if there's nothing to do. */ ino = (xfs_ino_t)*lastinop; - dip = NULL; + lastino = ino; agno = XFS_INO_TO_AGNO(mp, ino); agino = XFS_INO_TO_AGINO(mp, ino); if (agno >= mp->m_sb.sb_agcount || @@ -317,34 +233,31 @@ xfs_bulkstat( *ubcountp = 0; return 0; } + if (!ubcountp || *ubcountp <= 0) { + return EINVAL; + } ubcount = *ubcountp; /* statstruct's */ ubleft = ubcount * statstruct_size; /* bytes */ *ubcountp = ubelem = 0; *done = 0; fmterror = 0; ubufp = ubuffer; - nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ? - mp->m_sb.sb_inopblock : - (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog); - nimask = ~(nicluster - 1); - nbcluster = nicluster >> mp->m_sb.sb_inopblog; - /* - * Allocate a page-sized buffer for inode btree records. - * We could try allocating something smaller, but for normal - * calls we'll always (potentially) need the whole page. - */ - irbuf = kmem_alloc(NBPC, KM_SLEEP); - nirbuf = NBPC / sizeof(*irbuf); + blks_per_cluster = xfs_icluster_size_fsb(mp); + inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; + irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + if (!irbuf) + return ENOMEM; + + nirbuf = irbsize / sizeof(*irbuf); + /* * Loop over the allocation groups, starting from the last * inode returned; 0 means start of the allocation group. */ rval = 0; - while (ubleft >= statstruct_size && agno < mp->m_sb.sb_agcount) { - bp = NULL; - down_read(&mp->m_peraglock); + while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { + cond_resched(); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { /* * Skip this allocation group and go to the next one. @@ -357,8 +270,8 @@ xfs_bulkstat( /* * Allocate and initialize a btree cursor for ialloc btree. */ - cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO, - (xfs_inode_t *)0, 0); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); irbp = irbuf; irbufend = irbuf + nirbuf; end_of_ag = 0; @@ -367,40 +280,43 @@ xfs_bulkstat( * we need to get the remainder of the chunk we're in. */ if (agino > 0) { + xfs_inobt_rec_incore_t r; + /* * Lookup the inode chunk that this inode lives in. */ - error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, + &tmp); if (!error && /* no I/O error */ tmp && /* lookup succeeded */ /* got the record, should always work */ - !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, - &gfree, &i)) && + !(error = xfs_inobt_get_rec(cur, &r, &i)) && i == 1 && /* this is the right chunk */ - agino < gino + XFS_INODES_PER_CHUNK && + agino < r.ir_startino + XFS_INODES_PER_CHUNK && /* lastino was not last in chunk */ - (chunkidx = agino - gino + 1) < + (chunkidx = agino - r.ir_startino + 1) < XFS_INODES_PER_CHUNK && /* there are some left allocated */ - XFS_INOBT_MASKN(chunkidx, - XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { + xfs_inobt_maskn(chunkidx, + XFS_INODES_PER_CHUNK - chunkidx) & + ~r.ir_free) { /* * Grab the chunk record. Mark all the * uninteresting inodes (because they're * before our start point) free. */ for (i = 0; i < chunkidx; i++) { - if (XFS_INOBT_MASK(i) & ~gfree) - gcnt++; + if (XFS_INOBT_MASK(i) & ~r.ir_free) + r.ir_freecount++; } - gfree |= XFS_INOBT_MASKN(0, chunkidx); - INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); - INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); - INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + r.ir_free |= xfs_inobt_maskn(0, chunkidx); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; irbp++; - agino = gino + XFS_INODES_PER_CHUNK; - icount = XFS_INODES_PER_CHUNK - gcnt; + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + icount = XFS_INODES_PER_CHUNK - r.ir_freecount; } else { /* * If any of those tests failed, bump the @@ -413,12 +329,12 @@ xfs_bulkstat( * In any case, increment to the next record. */ if (!error) - error = xfs_inobt_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &tmp); } else { /* * Start of ag. Lookup the first inode chunk. */ - error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); + error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); icount = 0; } /* @@ -426,6 +342,8 @@ xfs_bulkstat( * until we run out of inodes or space in the buffer. */ while (irbp < irbufend && icount < ubcount) { + xfs_inobt_rec_incore_t r; + /* * Loop as long as we're unable to read the * inode btree. @@ -435,35 +353,61 @@ xfs_bulkstat( if (XFS_AGINO_TO_AGBNO(mp, agino) >= be32_to_cpu(agi->agi_length)) break; - error = xfs_inobt_lookup_ge(cur, agino, 0, 0, - &tmp); + error = xfs_inobt_lookup(cur, agino, + XFS_LOOKUP_GE, &tmp); + cond_resched(); } /* * If ran off the end of the ag either with an error, * or the normal way, set end and stop collecting. */ - if (error || - (error = xfs_inobt_get_rec(cur, &gino, &gcnt, - &gfree, &i)) || - i == 0) { + if (error) { end_of_ag = 1; break; } + + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { + end_of_ag = 1; + break; + } + /* * If this chunk has any allocated inodes, save it. + * Also start read-ahead now for this chunk. */ - if (gcnt < XFS_INODES_PER_CHUNK) { - INT_SET(irbp->ir_startino, ARCH_CONVERT, gino); - INT_SET(irbp->ir_freecount, ARCH_CONVERT, gcnt); - INT_SET(irbp->ir_free, ARCH_CONVERT, gfree); + if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + struct blk_plug plug; + /* + * Loop over all clusters in the next chunk. + * Do a readahead if there are any allocated + * inodes in that cluster. + */ + blk_start_plug(&plug); + agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); + for (chunkidx = 0; + chunkidx < XFS_INODES_PER_CHUNK; + chunkidx += inodes_per_cluster, + agbno += blks_per_cluster) { + if (xfs_inobt_maskn(chunkidx, + inodes_per_cluster) & ~r.ir_free) + xfs_btree_reada_bufs(mp, agno, + agbno, blks_per_cluster, + &xfs_inode_buf_ops); + } + blk_finish_plug(&plug); + irbp->ir_startino = r.ir_startino; + irbp->ir_freecount = r.ir_freecount; + irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - gcnt; + icount += XFS_INODES_PER_CHUNK - r.ir_freecount; } /* * Set agino to after this chunk and bump the cursor. */ - agino = gino + XFS_INODES_PER_CHUNK; - error = xfs_inobt_increment(cur, 0, &tmp); + agino = r.ir_startino + XFS_INODES_PER_CHUNK; + error = xfs_btree_increment(cur, 0, &tmp); + cond_resched(); } /* * Drop the btree buffers and the agi buffer. @@ -477,128 +421,44 @@ xfs_bulkstat( */ irbufend = irbp; for (irbp = irbuf; - irbp < irbufend && ubleft >= statstruct_size; irbp++) { - /* - * Read-ahead the next chunk's worth of inodes. - */ - if (&irbp[1] < irbufend) { - /* - * Loop over all clusters in the next chunk. - * Do a readahead if there are any allocated - * inodes in that cluster. - */ - for (agbno = XFS_AGINO_TO_AGBNO(mp, - INT_GET(irbp[1].ir_startino, ARCH_CONVERT)), - chunkidx = 0; - chunkidx < XFS_INODES_PER_CHUNK; - chunkidx += nicluster, - agbno += nbcluster) { - if (XFS_INOBT_MASKN(chunkidx, - nicluster) & - ~(INT_GET(irbp[1].ir_free, ARCH_CONVERT))) - xfs_btree_reada_bufs(mp, agno, - agbno, nbcluster); - } - } + irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { /* * Now process this chunk of inodes. */ - for (agino = INT_GET(irbp->ir_startino, ARCH_CONVERT), chunkidx = 0, clustidx = 0; - ubleft > 0 && - INT_GET(irbp->ir_freecount, ARCH_CONVERT) < XFS_INODES_PER_CHUNK; + for (agino = irbp->ir_startino, chunkidx = clustidx = 0; + XFS_BULKSTAT_UBLEFT(ubleft) && + irbp->ir_freecount < XFS_INODES_PER_CHUNK; chunkidx++, clustidx++, agino++) { ASSERT(chunkidx < XFS_INODES_PER_CHUNK); - /* - * Recompute agbno if this is the - * first inode of the cluster. - * - * Careful with clustidx. There can be - * multple clusters per chunk, a single - * cluster per chunk or a cluster that has - * inodes represented from several different - * chunks (if blocksize is large). - * - * Because of this, the starting clustidx is - * initialized to zero in this loop but must - * later be reset after reading in the cluster - * buffer. - */ - if ((chunkidx & (nicluster - 1)) == 0) { - agbno = XFS_AGINO_TO_AGBNO(mp, - INT_GET(irbp->ir_startino, ARCH_CONVERT)) + - ((chunkidx & nimask) >> - mp->m_sb.sb_inopblog); - - if (flags & BULKSTAT_FG_QUICK) { - ino = XFS_AGINO_TO_INO(mp, agno, - agino); - bno = XFS_AGB_TO_DADDR(mp, agno, - agbno); - - /* - * Get the inode cluster buffer - */ - ASSERT(xfs_inode_zone != NULL); - ip = kmem_zone_zalloc(xfs_inode_zone, - KM_SLEEP); - ip->i_ino = ino; - ip->i_mount = mp; - if (bp) - xfs_buf_relse(bp); - error = xfs_itobp(mp, NULL, ip, - &dip, &bp, bno, - XFS_IMAP_BULKSTAT); - if (!error) - clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; - kmem_zone_free(xfs_inode_zone, ip); - if (XFS_TEST_ERROR(error != 0, - mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK, - XFS_RANDOM_BULKSTAT_READ_CHUNK)) { - bp = NULL; - ubleft = 0; - rval = error; - break; - } - } - } + + ino = XFS_AGINO_TO_INO(mp, agno, agino); /* * Skip if this inode is free. */ - if (XFS_INOBT_MASK(chunkidx) & INT_GET(irbp->ir_free, ARCH_CONVERT)) + if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { + lastino = ino; continue; + } /* * Count used inodes as free so we can tell * when the chunk is used up. */ - INT_MOD(irbp->ir_freecount, ARCH_CONVERT, +1); - ino = XFS_AGINO_TO_INO(mp, agno, agino); - bno = XFS_AGB_TO_DADDR(mp, agno, agbno); - if (flags & BULKSTAT_FG_QUICK) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - (clustidx << mp->m_sb.sb_inodelog)); - - if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) - != XFS_DINODE_MAGIC - || !XFS_DINODE_GOOD_VERSION( - INT_GET(dip->di_core.di_version, ARCH_CONVERT))) - continue; - } + irbp->ir_freecount++; /* * Get the inode and fill in a single buffer. - * BULKSTAT_FG_QUICK uses dip to fill it in. - * BULKSTAT_FG_IGET uses igets. - * See: xfs_bulkstat_one & xfs_dm_bulkstat_one. - * This is also used to count inodes/blks, etc - * in xfs_qm_quotacheck. */ ubused = statstruct_size; - error = formatter(mp, ino, ubufp, - ubleft, private_data, - bno, &ubused, dip, &fmterror); + error = formatter(mp, ino, ubufp, ubleft, + &ubused, &fmterror); if (fmterror == BULKSTAT_RV_NOTHING) { - if (error == ENOMEM) + if (error && error != ENOENT && + error != EINVAL) { ubleft = 0; + rval = error; + break; + } + lastino = ino; continue; } if (fmterror == BULKSTAT_RV_GIVEUP) { @@ -613,15 +473,13 @@ xfs_bulkstat( ubelem++; lastino = ino; } - } - - if (bp) - xfs_buf_relse(bp); + cond_resched(); + } /* * Set up for the next loop iteration. */ - if (ubleft > 0) { + if (XFS_BULKSTAT_UBLEFT(ubleft)) { if (end_of_ag) { agno++; agino = 0; @@ -633,8 +491,13 @@ xfs_bulkstat( /* * Done, we're either out of filesystem or space to put the data. */ - kmem_free(irbuf, NBPC); + kmem_free(irbuf); *ubcountp = ubelem; + /* + * Found some inodes, return them now and return the error next time. + */ + if (ubelem) + rval = 0; if (agno >= mp->m_sb.sb_agcount) { /* * If we ran out of filesystem, mark lastino as off @@ -667,16 +530,16 @@ xfs_bulkstat_single( /* * note that requesting valid inode numbers which are not allocated - * to inodes will most likely cause xfs_itobp to generate warning + * to inodes will most likely cause xfs_imap_to_bp to generate warning * messages about bad magic numbers. This is ok. The fact that * the inode isn't actually an inode is handled by the * error check below. Done this way to make the usual case faster * at the expense of the error case. */ - ino = (xfs_ino_t)*lastinop; + ino = *lastinop; error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), - NULL, 0, NULL, NULL, &res); + NULL, &res); if (error) { /* * Special case way failed, do it the "long" way @@ -685,8 +548,7 @@ xfs_bulkstat_single( (*lastinop)--; count = 1; if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one, - NULL, sizeof(xfs_bstat_t), buffer, - BULKSTAT_FG_IGET, done)) + sizeof(xfs_bstat_t), buffer, done)) return error; if (count == 0 || (xfs_ino_t)*lastinop != ino) return error == EFSCORRUPTED ? @@ -698,6 +560,19 @@ xfs_bulkstat_single( return 0; } +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written) /* # of bytes written */ +{ + if (copy_to_user(ubuffer, buffer, count * sizeof(*buffer))) + return -EFAULT; + *written = count * sizeof(*buffer); + return 0; +} + /* * Return inode number table for the filesystem. */ @@ -706,7 +581,8 @@ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *lastino, /* last inode returned */ int *count, /* size of buffer/count returned */ - xfs_inogrp_t __user *ubuffer)/* buffer with inode descriptions */ + void __user *ubuffer,/* buffer with inode descriptions */ + inumbers_fmt_pf formatter) { xfs_buf_t *agbp; xfs_agino_t agino; @@ -716,9 +592,7 @@ xfs_inumbers( int bufidx; xfs_btree_cur_t *cur; int error; - __int32_t gcnt; - xfs_inofree_t gfree; - xfs_agino_t gino; + xfs_inobt_rec_incore_t r; int i; xfs_ino_t ino; int left; @@ -729,16 +603,14 @@ xfs_inumbers( agino = XFS_INO_TO_AGINO(mp, ino); left = *count; *count = 0; - bcount = MIN(left, (int)(NBPP / sizeof(*buffer))); + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); error = bufidx = 0; cur = NULL; agbp = NULL; while (left > 0 && agno < mp->m_sb.sb_agcount) { if (agbp == NULL) { - down_read(&mp->m_peraglock); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); - up_read(&mp->m_peraglock); if (error) { /* * If we can't read the AGI of this ag, @@ -750,16 +622,17 @@ xfs_inumbers( agino = 0; continue; } - cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, - XFS_BTNUM_INO, (xfs_inode_t *)0, 0); - error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, + XFS_BTNUM_INO); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, + &tmp); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); cur = NULL; xfs_buf_relse(agbp); agbp = NULL; /* - * Move up the the last inode in the current + * Move up the last inode in the current * chunk. The lookup_ge will always get * us the first inode in the next chunk. */ @@ -767,9 +640,8 @@ xfs_inumbers( continue; } } - if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, - &i)) || - i == 0) { + error = xfs_inobt_get_rec(cur, &r, &i); + if (error || i == 0) { xfs_buf_relse(agbp); agbp = NULL; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); @@ -778,24 +650,26 @@ xfs_inumbers( agino = 0; continue; } - agino = gino + XFS_INODES_PER_CHUNK - 1; - buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); - buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; - buffer[bufidx].xi_allocmask = ~gfree; + agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; + buffer[bufidx].xi_startino = + XFS_AGINO_TO_INO(mp, agno, r.ir_startino); + buffer[bufidx].xi_alloccount = + XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_allocmask = ~r.ir_free; bufidx++; left--; if (bufidx == bcount) { - if (copy_to_user(ubuffer, buffer, - bufidx * sizeof(*buffer))) { + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) { error = XFS_ERROR(EFAULT); break; } - ubuffer += bufidx; + ubuffer += written; *count += bufidx; bufidx = 0; } if (left) { - error = xfs_inobt_increment(cur, 0, &tmp); + error = xfs_btree_increment(cur, 0, &tmp); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); cur = NULL; @@ -812,15 +686,15 @@ xfs_inumbers( } if (!error) { if (bufidx) { - if (copy_to_user(ubuffer, buffer, - bufidx * sizeof(*buffer))) + long written; + if (formatter(ubuffer, buffer, bufidx, &written)) error = XFS_ERROR(EFAULT); else *count += bufidx; } *lastino = XFS_AGINO_TO_INO(mp, agno, agino); } - kmem_free(buffer, bcount * sizeof(*buffer)); + kmem_free(buffer); if (cur) xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR)); diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index be5f12e07d2..97295d91d17 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -27,24 +27,15 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, xfs_ino_t ino, void __user *buffer, int ubsize, - void *private_data, - xfs_daddr_t bno, int *ubused, - void *dip, int *stat); /* * Values for stat return value. */ -#define BULKSTAT_RV_NOTHING 0 -#define BULKSTAT_RV_DIDONE 1 -#define BULKSTAT_RV_GIVEUP 2 - -/* - * Values for bulkstat flag argument. - */ -#define BULKSTAT_FG_IGET 0x1 /* Go through the buffer cache */ -#define BULKSTAT_FG_QUICK 0x2 /* No iget, walk the dinode cluster */ +#define BULKSTAT_RV_NOTHING 0 +#define BULKSTAT_RV_DIDONE 1 +#define BULKSTAT_RV_GIVEUP 2 /* * Return stat information in bulk (by-inode) for the filesystem. @@ -55,10 +46,8 @@ xfs_bulkstat( xfs_ino_t *lastino, /* last inode returned */ int *count, /* size of buffer/count returned */ bulkstat_one_pf formatter, /* func that'd fill a single buf */ - void *private_data, /* private data for formatter */ size_t statstruct_size,/* sizeof struct that we're filling */ char __user *ubuffer,/* buffer with inode stats */ - int flags, /* flag to control access method */ int *done); /* 1 if there are more stats to get */ int @@ -68,23 +57,50 @@ xfs_bulkstat_single( char __user *buffer, int *done); +typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ + void __user *ubuffer, /* buffer to write to */ + int ubsize, /* remaining user buffer sz */ + int *ubused, /* bytes used by formatter */ + const xfs_bstat_t *buffer); /* buffer to read from */ + +int +xfs_bulkstat_one_int( + xfs_mount_t *mp, + xfs_ino_t ino, + void __user *buffer, + int ubsize, + bulkstat_one_fmt_pf formatter, + int *ubused, + int *stat); + int xfs_bulkstat_one( xfs_mount_t *mp, xfs_ino_t ino, void __user *buffer, int ubsize, - void *private_data, - xfs_daddr_t bno, int *ubused, - void *dibuff, int *stat); +typedef int (*inumbers_fmt_pf)( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + +int +xfs_inumbers_fmt( + void __user *ubuffer, /* buffer to write to */ + const xfs_inogrp_t *buffer, /* buffer to read from */ + long count, /* # of elements to read */ + long *written); /* # of bytes written */ + int /* error status */ xfs_inumbers( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *last, /* last inode returned */ int *count, /* size of buffer/count returned */ - xfs_inogrp_t __user *buffer);/* buffer with inode info */ + void __user *buffer, /* buffer with inode info */ + inumbers_fmt_pf formatter); #endif /* __XFS_ITABLE_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/xfs_linux.h index a13f75c1a93..825249d2dfc 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -21,44 +21,64 @@ #include <linux/types.h> /* - * Some types are conditional depending on the target system. * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. - * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well - * as requiring XFS_BIG_BLKNOS to be set. + * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set. */ -#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64) +#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64) # define XFS_BIG_BLKNOS 1 -# if BITS_PER_LONG == 64 -# define XFS_BIG_INUMS 1 -# else -# define XFS_BIG_INUMS 0 -# endif +# define XFS_BIG_INUMS 1 #else # define XFS_BIG_BLKNOS 0 # define XFS_BIG_INUMS 0 #endif -#include <xfs_types.h> -#include <xfs_arch.h> +/* + * Kernel specific type declarations for XFS + */ +typedef signed char __int8_t; +typedef unsigned char __uint8_t; +typedef signed short int __int16_t; +typedef unsigned short int __uint16_t; +typedef signed int __int32_t; +typedef unsigned int __uint32_t; +typedef signed long long int __int64_t; +typedef unsigned long long int __uint64_t; + +typedef __uint32_t inst_t; /* an instruction */ + +typedef __s64 xfs_off_t; /* <file offset> type */ +typedef unsigned long long xfs_ino_t; /* <inode> type */ +typedef __s64 xfs_daddr_t; /* <disk address> type */ +typedef char * xfs_caddr_t; /* <core address> type */ +typedef __u32 xfs_dev_t; +typedef __u32 xfs_nlink_t; + +/* __psint_t is the same size as a pointer */ +#if (BITS_PER_LONG == 32) +typedef __int32_t __psint_t; +typedef __uint32_t __psunsigned_t; +#elif (BITS_PER_LONG == 64) +typedef __int64_t __psint_t; +typedef __uint64_t __psunsigned_t; +#else +#error BITS_PER_LONG must be 32 or 64 +#endif -#include <kmem.h> -#include <mrlock.h> -#include <spin.h> -#include <sv.h> -#include <mutex.h> -#include <sema.h> -#include <time.h> +#include "xfs_types.h" -#include <support/ktrace.h> -#include <support/debug.h> -#include <support/move.h> -#include <support/uuid.h> +#include "kmem.h" +#include "mrlock.h" +#include "time.h" +#include "uuid.h" +#include <linux/semaphore.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/crc32c.h> #include <linux/module.h> +#include <linux/mutex.h> #include <linux/file.h> #include <linux/swap.h> #include <linux/errno.h> @@ -75,6 +95,16 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/delay.h> +#include <linux/log2.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/ctype.h> +#include <linux/writeback.h> +#include <linux/capability.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/list_sort.h> +#include <linux/ratelimit.h> #include <asm/page.h> #include <asm/div64.h> @@ -83,43 +113,31 @@ #include <asm/byteorder.h> #include <asm/unaligned.h> -#include <xfs_behavior.h> -#include <xfs_vfs.h> -#include <xfs_cred.h> -#include <xfs_vnode.h> -#include <xfs_stats.h> -#include <xfs_sysctl.h> -#include <xfs_iops.h> -#include <xfs_aops.h> -#include <xfs_super.h> -#include <xfs_globals.h> -#include <xfs_fs_subr.h> -#include <xfs_lrw.h> -#include <xfs_buf.h> +#include "xfs_vnode.h" +#include "xfs_stats.h" +#include "xfs_sysctl.h" +#include "xfs_iops.h" +#include "xfs_aops.h" +#include "xfs_super.h" +#include "xfs_cksum.h" +#include "xfs_buf.h" +#include "xfs_message.h" + +#ifdef __BIG_ENDIAN +#define XFS_NATIVE_HOST 1 +#else +#undef XFS_NATIVE_HOST +#endif /* * Feature macros (disable/enable) */ -#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ -#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ -#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else #undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #endif -/* - * State flag for unwritten extent buffers. - * - * We need to be able to distinguish between these and delayed - * allocate buffers within XFS. The generic IO path code does - * not need to distinguish - we use the BH_Delay flag for both - * delalloc and these ondisk-uninitialised buffers. - */ -BUFFER_FNS(PrivateStart, unwritten); - -#define restricted_chown xfs_params.restrict_chown.val #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val @@ -129,16 +147,14 @@ BUFFER_FNS(PrivateStart, unwritten); #define xfs_inherit_sync xfs_params.inherit_sync.val #define xfs_inherit_nodump xfs_params.inherit_nodump.val #define xfs_inherit_noatime xfs_params.inherit_noatim.val -#define xfs_buf_timer_centisecs xfs_params.xfs_buf_timer.val -#define xfs_buf_age_centisecs xfs_params.xfs_buf_age.val #define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val #define xfs_rotorstep xfs_params.rotorstep.val #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val +#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val +#define xfs_eofb_secs xfs_params.eofb_timer.val #define current_cpu() (raw_smp_processor_id()) #define current_pid() (current->pid) -#define current_fsuid(cred) (current->fsuid) -#define current_fsgid(cred) (current->fsgid) #define current_test_flags(f) (current->flags & (f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) @@ -147,86 +163,65 @@ BUFFER_FNS(PrivateStart, unwritten); #define current_restore_flags_nested(sp, f) \ (current->flags = ((current->flags & ~(f)) | (*(sp) & (f)))) -#define NBPP PAGE_SIZE -#define DPPSHFT (PAGE_SHIFT - 9) -#define NDPP (1 << (PAGE_SHIFT - 9)) -#define dtop(DD) (((DD) + NDPP - 1) >> DPPSHFT) -#define dtopt(DD) ((DD) >> DPPSHFT) -#define dpoff(DD) ((DD) & (NDPP-1)) +#define spinlock_destroy(lock) #define NBBY 8 /* number of bits per byte */ -#define NBPC PAGE_SIZE /* Number of bytes per click */ -#define BPCSHIFT PAGE_SHIFT /* LOG2(NBPC) if exact */ /* * Size of block device i/o is parameterized here. * Currently the system supports page-sized i/o. */ -#define BLKDEV_IOSHIFT BPCSHIFT +#define BLKDEV_IOSHIFT PAGE_CACHE_SHIFT #define BLKDEV_IOSIZE (1<<BLKDEV_IOSHIFT) /* number of BB's per block device block */ #define BLKDEV_BB BTOBB(BLKDEV_IOSIZE) -/* bytes to clicks */ -#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) -#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) -#define btoc64(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) -#define btoct64(x) ((__uint64_t)(x)>>BPCSHIFT) -#define io_btoc(x) (((__psunsigned_t)(x)+(IO_NBPC-1))>>IO_BPCSHIFT) -#define io_btoct(x) ((__psunsigned_t)(x)>>IO_BPCSHIFT) - -/* off_t bytes to clicks */ -#define offtoc(x) (((__uint64_t)(x)+(NBPC-1))>>BPCSHIFT) -#define offtoct(x) ((xfs_off_t)(x)>>BPCSHIFT) - -/* clicks to off_t bytes */ -#define ctooff(x) ((xfs_off_t)(x)<<BPCSHIFT) - -/* clicks to bytes */ -#define ctob(x) ((__psunsigned_t)(x)<<BPCSHIFT) -#define btoct(x) ((__psunsigned_t)(x)>>BPCSHIFT) -#define ctob64(x) ((__uint64_t)(x)<<BPCSHIFT) -#define io_ctob(x) ((__psunsigned_t)(x)<<IO_BPCSHIFT) - -/* bytes to clicks */ -#define btoc(x) (((__psunsigned_t)(x)+(NBPC-1))>>BPCSHIFT) - #define ENOATTR ENODATA /* Attribute not found */ #define EWRONGFS EINVAL /* Mount with wrong filesystem type */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ +#define EFSBADCRC EBADMSG /* Bad CRC detected */ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) -/* - * IRIX (BSD) quotactl makes use of separate commands for user/group, - * whereas on Linux the syscall encodes this information into the cmd - * field (see the QCMD macro in quota.h). These macros help keep the - * code portable - they are not visible from the syscall interface. - */ -#define Q_XSETGQLIM XQM_CMD(8) /* set groups disk limits */ -#define Q_XGETGQUOTA XQM_CMD(9) /* get groups disk limits */ -#define Q_XSETPQLIM XQM_CMD(10) /* set projects disk limits */ -#define Q_XGETPQUOTA XQM_CMD(11) /* get projects disk limits */ - -#define dfltprid 0 +#define XFS_PROJID_DEFAULT 0 #define MAXPATHLEN 1024 #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) +/* Kernel uid/gid conversion. These are used to convert to/from the on disk + * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. + * The conversion here is type only, the value will remain the same since we + * are converting to the init_user_ns. The uid is later mapped to a particular + * user namespace value when crossing the kernel/user boundary. + */ +static inline __uint32_t xfs_kuid_to_uid(kuid_t uid) +{ + return from_kuid(&init_user_ns, uid); +} + +static inline kuid_t xfs_uid_to_kuid(__uint32_t uid) +{ + return make_kuid(&init_user_ns, uid); +} + +static inline __uint32_t xfs_kgid_to_gid(kgid_t gid) +{ + return from_kgid(&init_user_ns, gid); +} + +static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) +{ + return make_kgid(&init_user_ns, gid); +} + /* * Various platform dependent calls that don't fit anywhere else */ #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() -#define xfs_itruncate_data(ip, off) \ - (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off))) -#define xfs_statvfs_fsid(statp, mp) \ - ({ u64 id = huge_encode_dev((mp)->m_ddev_targp->bt_dev); \ - __kernel_fsid_t *fsid = &(statp)->f_fsid; \ - (fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); }) /* Move the kernel do_div definition off to one side */ @@ -339,4 +334,51 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) return(x * y); } +static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) +{ + x += y - 1; + do_div(x, y); + return x; +} + +/* ARM old ABI has some weird alignment/padding */ +#if defined(__arm__) && !defined(__ARM_EABI__) +#define __arch_pack __attribute__((packed)) +#else +#define __arch_pack +#endif + +#define ASSERT_ALWAYS(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifdef DEBUG +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC noinline +#endif + +#else /* !DEBUG */ + +#ifdef XFS_WARN + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#else /* !DEBUG && !XFS_WARN */ + +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#endif /* XFS_WARN */ +#endif /* DEBUG */ + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 21ac1a67e3e..292308dede6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -17,239 +17,473 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" #include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" #include "xfs_log_recover.h" -#include "xfs_trans_priv.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_rw.h" +#include "xfs_trace.h" +#include "xfs_fsops.h" +#include "xfs_cksum.h" - -#define xlog_write_adv_cnt(ptr, len, off, bytes) \ - { (ptr) += (bytes); \ - (len) -= (bytes); \ - (off) += (bytes);} +kmem_zone_t *xfs_log_ticket_zone; /* Local miscellaneous function prototypes */ -STATIC int xlog_bdstrat_cb(struct xfs_buf *); -STATIC int xlog_commit_record(xfs_mount_t *mp, xlog_ticket_t *ticket, - xlog_in_core_t **, xfs_lsn_t *); -STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks); -STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); -STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_dealloc_log(xlog_t *log); -STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], - int nentries, xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags); +STATIC int +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp); + +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks); +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head); +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_dealloc_log( + struct xlog *log); /* local state machine functions */ STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); -STATIC void xlog_state_do_callback(xlog_t *log,int aborted, xlog_in_core_t *iclog); -STATIC int xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclog, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp); -STATIC void xlog_state_put_ticket(xlog_t *log, - xlog_ticket_t *tic); -STATIC int xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog); -STATIC void xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size); -STATIC int xlog_state_sync(xlog_t *log, - xfs_lsn_t lsn, - uint flags, - int *log_flushed); -STATIC int xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed); -STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog); - -/* local functions to manipulate grant head */ -STATIC int xlog_grant_log_space(xlog_t *log, - xlog_ticket_t *xtic); -STATIC void xlog_grant_push_ail(xfs_mount_t *mp, - int need_bytes); -STATIC void xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket); -STATIC int xlog_regrant_write_log_space(xlog_t *log, - xlog_ticket_t *ticket); -STATIC void xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket); - - -/* local ticket functions */ -STATIC void xlog_state_ticket_alloc(xlog_t *log); -STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, - int unit_bytes, - int count, - char clientid, - uint flags); -STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket); +STATIC void +xlog_state_do_callback( + struct xlog *log, + int aborted, + struct xlog_in_core *iclog); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclog, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp); +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog); +STATIC void +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size); +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog); + +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes); +STATIC void +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket); +STATIC void +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket); #if defined(DEBUG) -STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr); -STATIC void xlog_verify_grant_head(xlog_t *log, int equals); -STATIC void xlog_verify_iclog(xlog_t *log, xlog_in_core_t *iclog, - int count, boolean_t syncing); -STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn); +STATIC void +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr); +STATIC void +xlog_verify_grant_tail( + struct xlog *log); +STATIC void +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing); +STATIC void +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn); #else #define xlog_verify_dest_ptr(a,b) -#define xlog_verify_grant_head(a,b) +#define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c,d) #define xlog_verify_tail_lsn(a,b,c) #endif -STATIC int xlog_iclogs_empty(xlog_t *log); +STATIC int +xlog_iclogs_empty( + struct xlog *log); -#if defined(XFS_LOG_TRACE) -void -xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) +static void +xlog_grant_sub_space( + struct xlog *log, + atomic64_t *head, + int bytes) { - unsigned long cnts; + int64_t head_val = atomic64_read(head); + int64_t new, old; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } - /* ticket counts are 1 byte each */ - cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; - - ktrace_enter(log->l_grant_trace, - (void *)tic, - (void *)log->l_reserve_headq, - (void *)log->l_write_headq, - (void *)((unsigned long)log->l_grant_reserve_cycle), - (void *)((unsigned long)log->l_grant_reserve_bytes), - (void *)((unsigned long)log->l_grant_write_cycle), - (void *)((unsigned long)log->l_grant_write_bytes), - (void *)((unsigned long)log->l_curr_cycle), - (void *)((unsigned long)log->l_curr_block), - (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)), - (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)), - (void *)string, - (void *)((unsigned long)tic->t_trans_type), - (void *)cnts, - (void *)((unsigned long)tic->t_curr_res), - (void *)((unsigned long)tic->t_unit_res)); + do { + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + space -= bytes; + if (space < 0) { + space += log->l_logsize; + cycle--; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } -void -xlog_trace_iclog(xlog_in_core_t *iclog, uint state) -{ - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_SLEEP); - ktrace_enter(iclog->ic_trace, - (void *)((unsigned long)state), - (void *)((unsigned long)current_pid()), - (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, - (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, - (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, - (void *)NULL, (void *)NULL); +static void +xlog_grant_add_space( + struct xlog *log, + atomic64_t *head, + int bytes) +{ + int64_t head_val = atomic64_read(head); + int64_t new, old; + + do { + int tmp; + int cycle, space; + + xlog_crack_grant_head_val(head_val, &cycle, &space); + + tmp = log->l_logsize - space; + if (tmp > bytes) + space += bytes; + else { + space = bytes - tmp; + cycle++; + } + + old = head_val; + new = xlog_assign_grant_head_val(cycle, space); + head_val = atomic64_cmpxchg(head, old, new); + } while (head_val != old); } -#else -#define xlog_trace_loggrant(log,tic,string) -#define xlog_trace_iclog(iclog,state) -#endif /* XFS_LOG_TRACE */ +STATIC void +xlog_grant_head_init( + struct xlog_grant_head *head) +{ + xlog_assign_grant_head(&head->grant, 1, 0); + INIT_LIST_HEAD(&head->waiters); + spin_lock_init(&head->lock); +} -static void -xlog_ins_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +STATIC void +xlog_grant_head_wake_all( + struct xlog_grant_head *head) { - if (*qp) { - tic->t_next = (*qp); - tic->t_prev = (*qp)->t_prev; - (*qp)->t_prev->t_next = tic; - (*qp)->t_prev = tic; - } else { - tic->t_prev = tic->t_next = tic; - *qp = tic; - } + struct xlog_ticket *tic; - tic->t_flags |= XLOG_TIC_IN_Q; + spin_lock(&head->lock); + list_for_each_entry(tic, &head->waiters, t_queue) + wake_up_process(tic->t_task); + spin_unlock(&head->lock); } -static void -xlog_del_ticketq(struct xlog_ticket **qp, struct xlog_ticket *tic) +static inline int +xlog_ticket_reservation( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic) { - if (tic == tic->t_next) { - *qp = NULL; + if (head == &log->l_write_head) { + ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + return tic->t_unit_res; } else { - *qp = tic->t_next; - tic->t_next->t_prev = tic->t_prev; - tic->t_prev->t_next = tic->t_next; + if (tic->t_flags & XLOG_TIC_PERM_RESERV) + return tic->t_unit_res * tic->t_cnt; + else + return tic->t_unit_res; } - - tic->t_next = tic->t_prev = NULL; - tic->t_flags &= ~XLOG_TIC_IN_Q; } -static void -xlog_grant_sub_space(struct log *log, int bytes) +STATIC bool +xlog_grant_head_wake( + struct xlog *log, + struct xlog_grant_head *head, + int *free_bytes) { - log->l_grant_write_bytes -= bytes; - if (log->l_grant_write_bytes < 0) { - log->l_grant_write_bytes += log->l_logsize; - log->l_grant_write_cycle--; + struct xlog_ticket *tic; + int need_bytes; + + list_for_each_entry(tic, &head->waiters, t_queue) { + need_bytes = xlog_ticket_reservation(log, head, tic); + if (*free_bytes < need_bytes) + return false; + + *free_bytes -= need_bytes; + trace_xfs_log_grant_wake_up(log, tic); + wake_up_process(tic->t_task); } - log->l_grant_reserve_bytes -= bytes; - if ((log)->l_grant_reserve_bytes < 0) { - log->l_grant_reserve_bytes += log->l_logsize; - log->l_grant_reserve_cycle--; + return true; +} + +STATIC int +xlog_grant_head_wait( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic, + int need_bytes) __releases(&head->lock) + __acquires(&head->lock) +{ + list_add_tail(&tic->t_queue, &head->waiters); + + do { + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + xlog_grant_push_ail(log, need_bytes); + + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&head->lock); + + XFS_STATS_INC(xs_sleep_logspace); + + trace_xfs_log_grant_sleep(log, tic); + schedule(); + trace_xfs_log_grant_wake(log, tic); + + spin_lock(&head->lock); + if (XLOG_FORCED_SHUTDOWN(log)) + goto shutdown; + } while (xlog_space_left(log, &head->grant) < need_bytes); + + list_del_init(&tic->t_queue); + return 0; +shutdown: + list_del_init(&tic->t_queue); + return XFS_ERROR(EIO); +} + +/* + * Atomically get the log space required for a log ticket. + * + * Once a ticket gets put onto head->waiters, it will only return after the + * needed reservation is satisfied. + * + * This function is structured so that it has a lock free fast path. This is + * necessary because every new transaction reservation will come through this + * path. Hence any lock will be globally hot if we take it unconditionally on + * every pass. + * + * As tickets are only ever moved on and off head->waiters under head->lock, we + * only need to take that lock if we are going to add the ticket to the queue + * and sleep. We can avoid taking the lock if the ticket was never added to + * head->waiters because the t_queue list head will be empty and we hold the + * only reference to it so it can safely be checked unlocked. + */ +STATIC int +xlog_grant_head_check( + struct xlog *log, + struct xlog_grant_head *head, + struct xlog_ticket *tic, + int *need_bytes) +{ + int free_bytes; + int error = 0; + + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + + /* + * If there are other waiters on the queue then give them a chance at + * logspace before us. Wake up the first waiters, if we do not wake + * up all the waiters then go to sleep waiting for more free space, + * otherwise try to get some space for this transaction. + */ + *need_bytes = xlog_ticket_reservation(log, head, tic); + free_bytes = xlog_space_left(log, &head->grant); + if (!list_empty_careful(&head->waiters)) { + spin_lock(&head->lock); + if (!xlog_grant_head_wake(log, head, &free_bytes) || + free_bytes < *need_bytes) { + error = xlog_grant_head_wait(log, head, tic, + *need_bytes); + } + spin_unlock(&head->lock); + } else if (free_bytes < *need_bytes) { + spin_lock(&head->lock); + error = xlog_grant_head_wait(log, head, tic, *need_bytes); + spin_unlock(&head->lock); } + return error; } static void -xlog_grant_add_space_write(struct log *log, int bytes) +xlog_tic_reset_res(xlog_ticket_t *tic) { - log->l_grant_write_bytes += bytes; - if (log->l_grant_write_bytes > log->l_logsize) { - log->l_grant_write_bytes -= log->l_logsize; - log->l_grant_write_cycle++; - } + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; + tic->t_res_num_ophdrs = 0; } static void -xlog_grant_add_space_reserve(struct log *log, int bytes) +xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type) { - log->l_grant_reserve_bytes += bytes; - if (log->l_grant_reserve_bytes > log->l_logsize) { - log->l_grant_reserve_bytes -= log->l_logsize; - log->l_grant_reserve_cycle++; + if (tic->t_res_num == XLOG_TIC_LEN_MAX) { + /* add to overflow and start again */ + tic->t_res_o_flow += tic->t_res_arr_sum; + tic->t_res_num = 0; + tic->t_res_arr_sum = 0; } + + tic->t_res_arr[tic->t_res_num].r_len = len; + tic->t_res_arr[tic->t_res_num].r_type = type; + tic->t_res_arr_sum += len; + tic->t_res_num++; } -static inline void -xlog_grant_add_space(struct log *log, int bytes) +/* + * Replenish the byte reservation required by moving the grant write head. + */ +int +xfs_log_regrant( + struct xfs_mount *mp, + struct xlog_ticket *tic) { - xlog_grant_add_space_write(log, bytes); - xlog_grant_add_space_reserve(log, bytes); + struct xlog *log = mp->m_log; + int need_bytes; + int error = 0; + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + /* + * This is a new transaction on the ticket, so we need to change the + * transaction ID so that the next transaction has a different TID in + * the log. Just add one to the existing tid so that we can see chains + * of rolling transactions in the log easily. + */ + tic->t_tid++; + + xlog_grant_push_ail(log, tic->t_unit_res); + + tic->t_curr_res = tic->t_unit_res; + xlog_tic_reset_res(tic); + + if (tic->t_cnt > 0) + return 0; + + trace_xfs_log_regrant(log, tic); + + error = xlog_grant_head_check(log, &log->l_write_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_regrant_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; +} + +/* + * Reserve log space and return a ticket corresponding the reservation. + * + * Each reservation is going to reserve extra space for a log record header. + * When writes happen to the on-disk log, we don't subtract the length of the + * log record header from any reservation. By wasting space in each + * reservation, we prevent over allocation problems. + */ +int +xfs_log_reserve( + struct xfs_mount *mp, + int unit_bytes, + int cnt, + struct xlog_ticket **ticp, + __uint8_t client, + bool permanent, + uint t_type) +{ + struct xlog *log = mp->m_log; + struct xlog_ticket *tic; + int need_bytes; + int error = 0; + + ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); + + if (XLOG_FORCED_SHUTDOWN(log)) + return XFS_ERROR(EIO); + + XFS_STATS_INC(xs_try_logspace); + + ASSERT(*ticp == NULL); + tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, + KM_SLEEP | KM_MAYFAIL); + if (!tic) + return XFS_ERROR(ENOMEM); + + tic->t_trans_type = t_type; + *ticp = tic; + + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt + : tic->t_unit_res); + + trace_xfs_log_reserve(log, tic); + + error = xlog_grant_head_check(log, &log->l_reserve_head, tic, + &need_bytes); + if (error) + goto out_error; + + xlog_grant_add_space(log, &log->l_reserve_head.grant, need_bytes); + xlog_grant_add_space(log, &log->l_write_head.grant, need_bytes); + trace_xfs_log_reserve_exit(log, tic); + xlog_verify_grant_tail(log); + return 0; + +out_error: + /* + * If we are failing, make sure the ticket doesn't have any current + * reservations. We don't want to add this back when the ticket/ + * transaction gets cancelled. + */ + tic->t_curr_res = 0; + tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ + return error; } @@ -275,14 +509,14 @@ xlog_grant_add_space(struct log *log, int bytes) * out when the next write occurs. */ xfs_lsn_t -xfs_log_done(xfs_mount_t *mp, - xfs_log_ticket_t xtic, - void **iclog, - uint flags) +xfs_log_done( + struct xfs_mount *mp, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + uint flags) { - xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; - xfs_lsn_t lsn = 0; + struct xlog *log = mp->m_log; + xfs_lsn_t lsn = 0; if (XLOG_FORCED_SHUTDOWN(log) || /* @@ -290,8 +524,7 @@ xfs_log_done(xfs_mount_t *mp, * If we get an error, just continue and give back the log ticket. */ (((ticket->t_flags & XLOG_TIC_INITED) == 0) && - (xlog_commit_record(mp, ticket, - (xlog_in_core_t **)iclog, &lsn)))) { + (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { flags |= XFS_LOG_REL_PERM_RESERV; @@ -301,67 +534,27 @@ xfs_log_done(xfs_mount_t *mp, if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || (flags & XFS_LOG_REL_PERM_RESERV)) { + trace_xfs_log_done_nonperm(log, ticket); + /* * Release ticket if not permanent reservation or a specific * request has been made to release a permanent reservation. */ - xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); xlog_ungrant_log_space(log, ticket); - xlog_state_put_ticket(log, ticket); + xfs_log_ticket_put(ticket); } else { - xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); - xlog_regrant_reserve_log_space(log, ticket); - } + trace_xfs_log_done_perm(log, ticket); - /* If this ticket was a permanent reservation and we aren't - * trying to release it, reset the inited flags; so next time - * we write, a start record will be written out. - */ - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) && - (flags & XFS_LOG_REL_PERM_RESERV) == 0) + xlog_regrant_reserve_log_space(log, ticket); + /* If this ticket was a permanent reservation and we aren't + * trying to release it, reset the inited flags; so next time + * we write, a start record will be written out. + */ ticket->t_flags |= XLOG_TIC_INITED; + } return lsn; -} /* xfs_log_done */ - - -/* - * Force the in-core log to disk. If flags == XFS_LOG_SYNC, - * the force is done synchronously. - * - * Asynchronous forces are implemented by setting the WANT_SYNC - * bit in the appropriate in-core log and then returning. - * - * Synchronous forces are implemented with a semaphore. All callers - * to force a given lsn to disk will wait on a semaphore attached to the - * specific in-core log. When given in-core log finally completes its - * write to disk, that thread will wake up all threads waiting on the - * semaphore. - */ -int -_xfs_log_force( - xfs_mount_t *mp, - xfs_lsn_t lsn, - uint flags, - int *log_flushed) -{ - xlog_t *log = mp->m_log; - int dummy; - - if (!log_flushed) - log_flushed = &dummy; - - ASSERT(flags & XFS_LOG_FORCE); - - XFS_STATS_INC(xs_log_force); - - if (log->l_flags & XLOG_IO_ERROR) - return XFS_ERROR(EIO); - if (lsn == 0) - return xlog_state_sync_all(log, flags, log_flushed); - else - return xlog_state_sync(log, lsn, flags, log_flushed); -} /* xfs_log_force */ +} /* * Attaches a new iclog I/O completion callback routine during @@ -370,16 +563,14 @@ _xfs_log_force( * executing the callback at an appropriate time. */ int -xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ - void *iclog_hndl, /* iclog to hang callback off */ - xfs_log_callback_t *cb) +xfs_log_notify( + struct xfs_mount *mp, + struct xlog_in_core *iclog, + xfs_log_callback_t *cb) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; - int abortflg, spl; + int abortflg; - cb->cb_next = NULL; - spl = LOG_LOCK(log); + spin_lock(&iclog->ic_callback_lock); abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); if (!abortflg) { ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || @@ -388,18 +579,16 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ *(iclog->ic_callback_tail) = cb; iclog->ic_callback_tail = &(cb->cb_next); } - LOG_UNLOCK(log, spl); + spin_unlock(&iclog->ic_callback_lock); return abortflg; -} /* xfs_log_notify */ +} int -xfs_log_release_iclog(xfs_mount_t *mp, - void *iclog_hndl) +xfs_log_release_iclog( + struct xfs_mount *mp, + struct xlog_in_core *iclog) { - xlog_t *log = mp->m_log; - xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; - - if (xlog_state_release_iclog(log, iclog)) { + if (xlog_state_release_iclog(mp->m_log, iclog)) { xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); return EIO; } @@ -408,63 +597,6 @@ xfs_log_release_iclog(xfs_mount_t *mp, } /* - * 1. Reserve an amount of on-disk log space and return a ticket corresponding - * to the reservation. - * 2. Potentially, push buffers at tail of log to disk. - * - * Each reservation is going to reserve extra space for a log record header. - * When writes happen to the on-disk log, we don't subtract the length of the - * log record header from any reservation. By wasting space in each - * reservation, we prevent over allocation problems. - */ -int -xfs_log_reserve(xfs_mount_t *mp, - int unit_bytes, - int cnt, - xfs_log_ticket_t *ticket, - __uint8_t client, - uint flags, - uint t_type) -{ - xlog_t *log = mp->m_log; - xlog_ticket_t *internal_ticket; - int retval = 0; - - ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - ASSERT((flags & XFS_LOG_NOSLEEP) == 0); - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); - - XFS_STATS_INC(xs_try_logspace); - - if (*ticket != NULL) { - ASSERT(flags & XFS_LOG_PERM_RESERV); - internal_ticket = (xlog_ticket_t *)*ticket; - xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)"); - xlog_grant_push_ail(mp, internal_ticket->t_unit_res); - retval = xlog_regrant_write_log_space(log, internal_ticket); - } else { - /* may sleep if need to allocate more tickets */ - internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, - client, flags); - internal_ticket->t_trans_type = t_type; - *ticket = internal_ticket; - xlog_trace_loggrant(log, internal_ticket, - (internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ? - "xfs_log_reserve: create new ticket (permanent trans)" : - "xfs_log_reserve: create new ticket"); - xlog_grant_push_ail(mp, - (internal_ticket->t_unit_res * - internal_ticket->t_cnt)); - retval = xlog_grant_log_space(log, internal_ticket); - } - - return retval; -} /* xfs_log_reserve */ - - -/* * Mount a log filesystem * * mp - ubiquitous xfs mount point structure @@ -475,84 +607,150 @@ xfs_log_reserve(xfs_mount_t *mp, * Return error or zero. */ int -xfs_log_mount(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +xfs_log_mount( + xfs_mount_t *mp, + xfs_buftarg_t *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); - else { - cmn_err(CE_NOTE, - "!Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", - mp->m_fsname); - ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY); + int error = 0; + int min_logfsbs; + + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { + xfs_notice(mp, +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(mp->m_log)) { + error = -PTR_ERR(mp->m_log); + goto out; + } + + /* + * Validate the given log space and drop a critical message via syslog + * if the log size is too small that would lead to some unexpected + * situations in transaction log space reservation stage. + * + * Note: we can't just reject the mount if the validation fails. This + * would mean that people would have to downgrade their kernel just to + * remedy the situation as there is no way to grow the log (short of + * black magic surgery with xfs_db). + * + * We can, however, reject mounts for CRC format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); + + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); + error = EINVAL; + } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { + xfs_warn(mp, + "Log size %d blocks too large, maximum size is %lld blocks", + mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); + error = EINVAL; + } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { + xfs_warn(mp, + "log size %lld bytes too large, maximum size is %lld bytes", + XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), + XFS_MAX_LOG_BYTES); + error = EINVAL; + } + if (error) { + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); + goto out_free_log; + } + xfs_crit(mp, +"Log size out of supported range. Continuing onwards, but if log hangs are\n" +"experienced then please report this message in the bug report."); + } + + /* + * Initialize the AIL now we have a log. + */ + error = xfs_trans_ail_init(mp); + if (error) { + xfs_warn(mp, "AIL initialisation failed: error %d", error); + goto out_free_log; + } + mp->m_log->l_ailp = mp->m_ail; /* * skip log recovery on a norecovery mount. pretend it all * just worked. */ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - bhv_vfs_t *vfsp = XFS_MTOVFS(mp); - int error, readonly = (vfsp->vfs_flag & VFS_RDONLY); + int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (readonly) - vfsp->vfs_flag &= ~VFS_RDONLY; + mp->m_flags &= ~XFS_MOUNT_RDONLY; error = xlog_recover(mp->m_log); if (readonly) - vfsp->vfs_flag |= VFS_RDONLY; + mp->m_flags |= XFS_MOUNT_RDONLY; if (error) { - cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); - xlog_dealloc_log(mp->m_log); - return error; + xfs_warn(mp, "log mount/recovery failed: error %d", + error); + goto out_destroy_ail; } } /* Normal transactions can now occur */ mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; - /* End mounting message in xfs_log_mount_finish */ + /* + * Now the log has been fully initialised and we know were our + * space grant counters are, we can initialise the permanent ticket + * needed for delayed logging to work. + */ + xlog_cil_init_post_recovery(mp->m_log); + return 0; -} /* xfs_log_mount */ + +out_destroy_ail: + xfs_trans_ail_destroy(mp); +out_free_log: + xlog_dealloc_log(mp->m_log); +out: + return error; +} /* - * Finish the recovery of the file system. This is separate from - * the xfs_log_mount() call, because it depends on the code in - * xfs_mountfs() to read in the root and real-time bitmap inodes - * between calling xfs_log_mount() and here. + * Finish the recovery of the file system. This is separate from the + * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read + * in the root and real-time bitmap inodes between calling xfs_log_mount() and + * here. * - * mp - ubiquitous xfs mount point structure + * If we finish recovery successfully, start the background log work. If we are + * not doing recovery, then we have a RO filesystem and we don't need to start + * it. */ int -xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags) +xfs_log_mount_finish(xfs_mount_t *mp) { - int error; + int error = 0; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - error = xlog_recover_finish(mp->m_log, mfsi_flags); - else { - error = 0; - ASSERT(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY); + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + error = xlog_recover_finish(mp->m_log); + if (!error) + xfs_log_work_queue(mp); + } else { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } - return error; -} -/* - * Unmount processing for the log. - */ -int -xfs_log_unmount(xfs_mount_t *mp) -{ - int error; - - error = xfs_log_unmount_write(mp); - xfs_log_unmount_dealloc(mp); return error; } @@ -568,39 +766,31 @@ xfs_log_unmount(xfs_mount_t *mp) * Unmount record used to have a string "Unmount filesystem--" in the * data section where the "Un" was really a magic number (XLOG_UNMOUNT_TYPE). * We just write the magic number now since that particular field isn't - * currently architecture converted and "nUmount" is a bit foo. + * currently architecture converted and "Unmount" is a bit foo. * As far as I know, there weren't any dependencies on the old behaviour. */ int xfs_log_unmount_write(xfs_mount_t *mp) { - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; xlog_in_core_t *iclog; #ifdef DEBUG xlog_in_core_t *first_iclog; #endif - xfs_log_iovec_t reg[1]; - xfs_log_ticket_t tic = NULL; + xlog_ticket_t *tic = NULL; xfs_lsn_t lsn; int error; - SPLDECL(s); - - /* the data section must be 32 bit size aligned */ - struct { - __uint16_t magic; - __uint16_t pad1; - __uint32_t pad2; /* may as well make it 64 bits */ - } magic = { XLOG_UNMOUNT_TYPE, 0, 0 }; /* * Don't write out unmount record on read-only mounts. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) + if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; - xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); #ifdef DEBUG first_iclog = iclog = log->l_iclog; @@ -613,15 +803,31 @@ xfs_log_unmount_write(xfs_mount_t *mp) } while (iclog != first_iclog); #endif if (! (XLOG_FORCED_SHUTDOWN(log))) { - reg[0].i_addr = (void*)&magic; - reg[0].i_len = sizeof(magic); - XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_UNMOUNT); - - error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0); + error = xfs_log_reserve(mp, 600, 1, &tic, + XFS_LOG, 0, XLOG_UNMOUNT_REC_TYPE); if (!error) { - /* remove inited flag */ - ((xlog_ticket_t *)tic)->t_flags = 0; - error = xlog_write(mp, reg, 1, tic, &lsn, + /* the data section must be 32 bit size aligned */ + struct { + __uint16_t magic; + __uint16_t pad1; + __uint32_t pad2; /* may as well make it 64 bits */ + } magic = { + .magic = XLOG_UNMOUNT_TYPE, + }; + struct xfs_log_iovec reg = { + .i_addr = &magic, + .i_len = sizeof(magic), + .i_type = XLOG_REG_TYPE_UNMOUNT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; + + /* remove inited flag, and account for space used */ + tic->t_flags = 0; + tic->t_curr_res -= sizeof(magic); + error = xlog_write(log, &vec, tic, &lsn, NULL, XLOG_UNMOUNT_TRANS); /* * At this point, we're umounting anyway, @@ -630,33 +836,34 @@ xfs_log_unmount_write(xfs_mount_t *mp) */ } - if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_log_unmount: unmount record failed"); - } + if (error) + xfs_alert(mp, "%s: unmount record failed", __func__); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; - LOG_UNLOCK(log, s); + atomic_inc(&iclog->ic_refcnt); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if (!(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY)) { if (!XLOG_FORCED_SHUTDOWN(log)) { - sv_wait(&iclog->ic_forcesema, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } } else { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); + } + if (tic) { + trace_xfs_log_umount_write(log, tic); + xlog_ungrant_log_space(log, tic); + xfs_log_ticket_put(tic); } - if (tic) - xlog_state_put_ticket(log, tic); } else { /* * We're already in forced_shutdown mode, couldn't @@ -671,207 +878,223 @@ xfs_log_unmount_write(xfs_mount_t *mp) * a file system that went into forced_shutdown as * the result of an unmount.. */ - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; - iclog->ic_refcnt++; - LOG_UNLOCK(log, s); + atomic_inc(&iclog->ic_refcnt); xlog_state_want_sync(log, iclog); - (void) xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if ( ! ( iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY || iclog->ic_state == XLOG_STATE_IOERROR) ) { - sv_wait(&iclog->ic_forcesema, PMEM, - &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, + &log->l_icloglock); } else { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } } - return 0; + return error; } /* xfs_log_unmount_write */ /* - * Deallocate log structures for unmount/relocation. + * Empty the log for unmount/freeze. + * + * To do this, we first need to shut down the background log work so it is not + * trying to cover the log as we clean up. We then need to unpin all objects in + * the log so we can then flush them out. Once they have completed their IO and + * run the callbacks removing themselves from the AIL, we can write the unmount + * record. */ void -xfs_log_unmount_dealloc(xfs_mount_t *mp) +xfs_log_quiesce( + struct xfs_mount *mp) { - xlog_dealloc_log(mp->m_log); + cancel_delayed_work_sync(&mp->m_log->l_work); + xfs_log_force(mp, XFS_LOG_SYNC); + + /* + * The superblock buffer is uncached and while xfs_ail_push_all_sync() + * will push it, xfs_wait_buftarg() will not wait for it. Further, + * xfs_buf_iowait() cannot be used because it was pushed with the + * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for + * the IO to complete. + */ + xfs_ail_push_all_sync(mp->m_ail); + xfs_wait_buftarg(mp->m_ddev_targp); + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_unlock(mp->m_sb_bp); + + xfs_log_unmount_write(mp); } /* - * Write region vectors to log. The write happens using the space reservation - * of the ticket (tic). It is not a requirement that all writes for a given - * transaction occur with one call to xfs_log_write(). + * Shut down and release the AIL and Log. + * + * During unmount, we need to ensure we flush all the dirty metadata objects + * from the AIL so that the log is empty before we write the unmount record to + * the log. Once this is done, we can tear down the AIL and the log. */ -int -xfs_log_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn) +void +xfs_log_unmount( + struct xfs_mount *mp) { - int error; - xlog_t *log = mp->m_log; - - if (XLOG_FORCED_SHUTDOWN(log)) - return XFS_ERROR(EIO); + xfs_log_quiesce(mp); - if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } - return error; -} /* xfs_log_write */ + xfs_trans_ail_destroy(mp); + xlog_dealloc_log(mp->m_log); +} +void +xfs_log_item_init( + struct xfs_mount *mp, + struct xfs_log_item *item, + int type, + const struct xfs_item_ops *ops) +{ + item->li_mountp = mp; + item->li_ailp = mp->m_ail; + item->li_type = type; + item->li_ops = ops; + item->li_lv = NULL; + + INIT_LIST_HEAD(&item->li_ail); + INIT_LIST_HEAD(&item->li_cil); +} +/* + * Wake up processes waiting for log space after we have moved the log tail. + */ void -xfs_log_move_tail(xfs_mount_t *mp, - xfs_lsn_t tail_lsn) +xfs_log_space_wake( + struct xfs_mount *mp) { - xlog_ticket_t *tic; - xlog_t *log = mp->m_log; - int need_bytes, free_bytes, cycle, bytes; - SPLDECL(s); + struct xlog *log = mp->m_log; + int free_bytes; if (XLOG_FORCED_SHUTDOWN(log)) return; - ASSERT(!XFS_FORCED_SHUTDOWN(mp)); - - if (tail_lsn == 0) { - /* needed since sync_lsn is 64 bits */ - s = LOG_LOCK(log); - tail_lsn = log->l_last_sync_lsn; - LOG_UNLOCK(log, s); - } - s = GRANT_LOCK(log); + if (!list_empty_careful(&log->l_write_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - /* Also an invalid lsn. 1 implies that we aren't passing in a valid - * tail_lsn. - */ - if (tail_lsn != 1) { - log->l_tail_lsn = tail_lsn; + spin_lock(&log->l_write_head.lock); + free_bytes = xlog_space_left(log, &log->l_write_head.grant); + xlog_grant_head_wake(log, &log->l_write_head, &free_bytes); + spin_unlock(&log->l_write_head.lock); } - if ((tic = log->l_write_headq)) { -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("Recovery problem"); -#endif - cycle = log->l_grant_write_cycle; - bytes = log->l_grant_write_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { - ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV); + if (!list_empty_careful(&log->l_reserve_head.waiters)) { + ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); - if (free_bytes < tic->t_unit_res && tail_lsn != 1) - break; - tail_lsn = 0; - free_bytes -= tic->t_unit_res; - sv_signal(&tic->t_sema); - tic = tic->t_next; - } while (tic != log->l_write_headq); + spin_lock(&log->l_reserve_head.lock); + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + xlog_grant_head_wake(log, &log->l_reserve_head, &free_bytes); + spin_unlock(&log->l_reserve_head.lock); } - if ((tic = log->l_reserve_headq)) { -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("Recovery problem"); -#endif - cycle = log->l_grant_reserve_cycle; - bytes = log->l_grant_reserve_bytes; - free_bytes = xlog_space_left(log, cycle, bytes); - do { - if (tic->t_flags & XLOG_TIC_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_cnt; - else - need_bytes = tic->t_unit_res; - if (free_bytes < need_bytes && tail_lsn != 1) - break; - tail_lsn = 0; - free_bytes -= need_bytes; - sv_signal(&tic->t_sema); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); - } - GRANT_UNLOCK(log, s); -} /* xfs_log_move_tail */ +} /* - * Determine if we have a transaction that has gone to disk - * that needs to be covered. Log activity needs to be idle (no AIL and - * nothing in the iclogs). And, we need to be in the right state indicating - * something has gone out. + * Determine if we have a transaction that has gone to disk that needs to be + * covered. To begin the transition to the idle state firstly the log needs to + * be idle. That means the CIL, the AIL and the iclogs needs to be empty before + * we start attempting to cover the log. + * + * Only if we are then in a state where covering is needed, the caller is + * informed that dummy transactions are required to move the log into the idle + * state. + * + * If there are any items in the AIl or CIL, then we do not want to attempt to + * cover the log as we may be in a situation where there isn't log space + * available to run a dummy transaction and this can lead to deadlocks when the + * tail of the log is pinned by an item that is modified in the CIL. Hence + * there's no point in running a dummy transaction at this point because we + * can't start trying to idle the log until both the CIL and AIL are empty. */ int xfs_log_need_covered(xfs_mount_t *mp) { - SPLDECL(s); - int needed = 0, gen; - xlog_t *log = mp->m_log; - bhv_vfs_t *vfsp = XFS_MTOVFS(mp); + struct xlog *log = mp->m_log; + int needed = 0; + + if (!xfs_fs_writable(mp)) + return 0; - if (vfs_test_for_freeze(vfsp) || XFS_FORCED_SHUTDOWN(mp) || - (vfsp->vfs_flag & VFS_RDONLY)) + if (!xlog_cil_empty(log)) return 0; - s = LOG_LOCK(log); - if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || - (log->l_covered_state == XLOG_STATE_COVER_NEED2)) - && !xfs_trans_first_ail(mp, &gen) - && xlog_iclogs_empty(log)) { + spin_lock(&log->l_icloglock); + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (xfs_ail_min_lsn(log->l_ailp)) + break; + if (!xlog_iclogs_empty(log)) + break; + + needed = 1; if (log->l_covered_state == XLOG_STATE_COVER_NEED) log->l_covered_state = XLOG_STATE_COVER_DONE; - else { - ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); + else log->l_covered_state = XLOG_STATE_COVER_DONE2; - } + break; + default: needed = 1; + break; } - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return needed; } -/****************************************************************************** - * - * local routines - * - ****************************************************************************** - */ - -/* xfs_trans_tail_ail returns 0 when there is nothing in the list. - * The log manager must keep track of the last LR which was committed - * to disk. The lsn of this LR will become the new tail_lsn whenever - * xfs_trans_tail_ail returns 0. If we don't do this, we run into - * the situation where stuff could be written into the log but nothing - * was ever in the AIL when asked. Eventually, we panic since the - * tail hits the head. - * +/* * We may be holding the log iclog lock upon entering this routine. */ xfs_lsn_t -xlog_assign_tail_lsn(xfs_mount_t *mp) +xlog_assign_tail_lsn_locked( + struct xfs_mount *mp) { - xfs_lsn_t tail_lsn; - SPLDECL(s); - xlog_t *log = mp->m_log; + struct xlog *log = mp->m_log; + struct xfs_log_item *lip; + xfs_lsn_t tail_lsn; - tail_lsn = xfs_trans_tail_ail(mp); - s = GRANT_LOCK(log); - if (tail_lsn != 0) { - log->l_tail_lsn = tail_lsn; - } else { - tail_lsn = log->l_tail_lsn = log->l_last_sync_lsn; - } - GRANT_UNLOCK(log, s); + assert_spin_locked(&mp->m_ail->xa_lock); + /* + * To make sure we always have a valid LSN for the log tail we keep + * track of the last LSN which was committed in log->l_last_sync_lsn, + * and use that when the AIL was empty. + */ + lip = xfs_ail_min(mp->m_ail); + if (lip) + tail_lsn = lip->li_lsn; + else + tail_lsn = atomic64_read(&log->l_last_sync_lsn); + trace_xfs_log_assign_tail_lsn(log, tail_lsn); + atomic64_set(&log->l_tail_lsn, tail_lsn); return tail_lsn; -} /* xlog_assign_tail_lsn */ +} + +xfs_lsn_t +xlog_assign_tail_lsn( + struct xfs_mount *mp) +{ + xfs_lsn_t tail_lsn; + spin_lock(&mp->m_ail->xa_lock); + tail_lsn = xlog_assign_tail_lsn_locked(mp); + spin_unlock(&mp->m_ail->xa_lock); + + return tail_lsn; +} /* * Return the space in the log between the tail and the head. The head @@ -887,38 +1110,43 @@ xlog_assign_tail_lsn(xfs_mount_t *mp) * the tail. The details of this case are described below, but the end * result is that we return the size of the log as the amount of space left. */ -int -xlog_space_left(xlog_t *log, int cycle, int bytes) +STATIC int +xlog_space_left( + struct xlog *log, + atomic64_t *head) { - int free_bytes; - int tail_bytes; - int tail_cycle; - - tail_bytes = BBTOB(BLOCK_LSN(log->l_tail_lsn)); - tail_cycle = CYCLE_LSN(log->l_tail_lsn); - if ((tail_cycle == cycle) && (bytes >= tail_bytes)) { - free_bytes = log->l_logsize - (bytes - tail_bytes); - } else if ((tail_cycle + 1) < cycle) { + int free_bytes; + int tail_bytes; + int tail_cycle; + int head_cycle; + int head_bytes; + + xlog_crack_grant_head(head, &head_cycle, &head_bytes); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); + tail_bytes = BBTOB(tail_bytes); + if (tail_cycle == head_cycle && head_bytes >= tail_bytes) + free_bytes = log->l_logsize - (head_bytes - tail_bytes); + else if (tail_cycle + 1 < head_cycle) return 0; - } else if (tail_cycle < cycle) { - ASSERT(tail_cycle == (cycle - 1)); - free_bytes = tail_bytes - bytes; + else if (tail_cycle < head_cycle) { + ASSERT(tail_cycle == (head_cycle - 1)); + free_bytes = tail_bytes - head_bytes; } else { /* * The reservation head is behind the tail. * In this case we just want to return the size of the * log as the amount of space left. */ - xfs_fs_cmn_err(CE_ALERT, log->l_mp, + xfs_alert(log->l_mp, "xlog_space_left: head behind tail\n" " tail_cycle = %d, tail_bytes = %d\n" " GH cycle = %d, GH bytes = %d", - tail_cycle, tail_bytes, cycle, bytes); + tail_cycle, tail_bytes, head_cycle, head_bytes); ASSERT(0); free_bytes = log->l_logsize; } return free_bytes; -} /* xlog_space_left */ +} /* @@ -930,29 +1158,17 @@ xlog_space_left(xlog_t *log, int cycle, int bytes) void xlog_iodone(xfs_buf_t *bp) { - xlog_in_core_t *iclog; - xlog_t *l; - int aborted; - - iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); - ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); - aborted = 0; - - /* - * Some versions of cpp barf on the recursive definition of - * ic_log -> hic_fields.ic_log and expand ic_log twice when - * it is passed through two macros. Workaround broken cpp. - */ - l = iclog->ic_log; + struct xlog_in_core *iclog = bp->b_fspriv; + struct xlog *l = iclog->ic_log; + int aborted = 0; /* * Race to shutdown the filesystem if we see an error. */ - if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp, + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { - xfs_ioerror_alert("xlog_iodone", l->l_mp, bp, XFS_BUF_ADDR(bp)); - XFS_BUF_STALE(bp); + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); /* * This flag will be propagated to the trans-committed @@ -963,78 +1179,41 @@ xlog_iodone(xfs_buf_t *bp) } else if (iclog->ic_state & XLOG_STATE_IOERROR) { aborted = XFS_LI_ABORTED; } - xlog_state_done_syncing(iclog, aborted); - if (!(XFS_BUF_ISASYNC(bp))) { - /* - * Corresponding psema() will be done in bwrite(). If we don't - * vsema() here, panic. - */ - XFS_BUF_V_IODONESEMA(bp); - } -} /* xlog_iodone */ - -/* - * The bdstrat callback function for log bufs. This gives us a central - * place to trap bufs in case we get hit by a log I/O error and need to - * shutdown. Actually, in practice, even when we didn't get a log error, - * we transition the iclogs to IOERROR state *after* flushing all existing - * iclogs to disk. This is because we don't want anymore new transactions to be - * started or completed afterwards. - */ -STATIC int -xlog_bdstrat_cb(struct xfs_buf *bp) -{ - xlog_in_core_t *iclog; - - iclog = XFS_BUF_FSPRIVATE(bp, xlog_in_core_t *); - - if ((iclog->ic_state & XLOG_STATE_IOERROR) == 0) { - /* note for irix bstrat will need struct bdevsw passed - * Fix the following macro if the code ever is merged - */ - XFS_bdstrat(bp); - return 0; - } - - xfs_buftrace("XLOG__BDSTRAT IOERROR", bp); - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_STALE(bp); - xfs_biodone(bp); - return XFS_ERROR(EIO); + /* log I/O is always issued ASYNC */ + ASSERT(XFS_BUF_ISASYNC(bp)); + xlog_state_done_syncing(iclog, aborted); + /* + * drop the buffer lock now that we are done. Nothing references + * the buffer after this, so an unmount waiting on this lock can now + * tear it down safely. As such, it is unsafe to reference the buffer + * (bp) after the unlock as we could race with it being freed. + */ + xfs_buf_unlock(bp); } /* * Return size of each in-core log record buffer. * - * Low memory machines only get 2 16KB buffers. We don't want to waste - * memory here. However, all other machines get at least 2 32KB buffers. - * The number is hard coded because we don't care about the minimum - * memory size, just 32MB systems. + * All machines get 8 x 32kB buffers by default, unless tuned otherwise. * * If the filesystem blocksize is too large, we may need to choose a * larger size since the directory code currently logs entire blocks. */ STATIC void -xlog_get_iclog_buffer_size(xfs_mount_t *mp, - xlog_t *log) +xlog_get_iclog_buffer_size( + struct xfs_mount *mp, + struct xlog *log) { int size; int xhdrs; - if (mp->m_logbufs <= 0) { - if (xfs_physmem <= btoc(128*1024*1024)) { - log->l_iclog_bufs = XLOG_MIN_ICLOGS; - } else if (xfs_physmem <= btoc(400*1024*1024)) { - log->l_iclog_bufs = XLOG_MED_ICLOGS; - } else { /* 256K with 32K bufs */ - log->l_iclog_bufs = XLOG_MAX_ICLOGS; - } - } else { + if (mp->m_logbufs <= 0) + log->l_iclog_bufs = XLOG_MAX_ICLOGS; + else log->l_iclog_bufs = mp->m_logbufs; - } /* * Buffer size passed in from mount system call. @@ -1047,9 +1226,9 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, size >>= 1; } - if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { - /* # headers = size / 32K - * one header holds cycles from 32K of data + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + /* # headers = size / 32k + * one header holds cycles from 32k of data */ xhdrs = mp->m_logbsize / XLOG_HEADER_CYCLE_SIZE; @@ -1065,49 +1244,16 @@ xlog_get_iclog_buffer_size(xfs_mount_t *mp, goto done; } - /* - * Special case machines that have less than 32MB of memory. - * All machines with more memory use 32KB buffers. - */ - if (xfs_physmem <= btoc(32*1024*1024)) { - /* Don't change; min configuration */ - log->l_iclog_size = XLOG_RECORD_BSIZE; /* 16k */ - log->l_iclog_size_log = XLOG_RECORD_BSHIFT; - } else { - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; /* 32k */ - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - } + /* All machines use 32kB buffers by default. */ + log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; + log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; /* the default log size is 16k or 32k which is one header sector */ log->l_iclog_hsize = BBSIZE; log->l_iclog_heads = 1; - /* - * For 16KB, we use 3 32KB buffers. For 32KB block sizes, we use - * 4 32KB buffers. For 64KB block sizes, we use 8 32KB buffers. - */ - if (mp->m_sb.sb_blocksize >= 16*1024) { - log->l_iclog_size = XLOG_BIG_RECORD_BSIZE; - log->l_iclog_size_log = XLOG_BIG_RECORD_BSHIFT; - if (mp->m_logbufs <= 0) { - switch (mp->m_sb.sb_blocksize) { - case 16*1024: /* 16 KB */ - log->l_iclog_bufs = 3; - break; - case 32*1024: /* 32 KB */ - log->l_iclog_bufs = 4; - break; - case 64*1024: /* 64 KB */ - log->l_iclog_bufs = 8; - break; - default: - xlog_panic("XFS: Invalid blocksize"); - break; - } - } - } - -done: /* are we being asked to make the sizes selected above visible? */ +done: + /* are we being asked to make the sizes selected above visible? */ if (mp->m_logbufs == 0) mp->m_logbufs = log->l_iclog_bufs; if (mp->m_logbsize == 0) @@ -1115,26 +1261,66 @@ done: /* are we being asked to make the sizes selected above visible? */ } /* xlog_get_iclog_buffer_size */ +void +xfs_log_work_queue( + struct xfs_mount *mp) +{ + queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); +} + +/* + * Every sync period we need to unpin all items in the AIL and push them to + * disk. If there is nothing dirty, then we might need to cover the log to + * indicate that the filesystem is idle. + */ +void +xfs_log_worker( + struct work_struct *work) +{ + struct xlog *log = container_of(to_delayed_work(work), + struct xlog, l_work); + struct xfs_mount *mp = log->l_mp; + + /* dgc: errors ignored - not fatal and nowhere to report them */ + if (xfs_log_need_covered(mp)) + xfs_fs_log_dummy(mp); + else + xfs_log_force(mp, 0); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); + + /* queue us up again */ + xfs_log_work_queue(mp); +} + /* * This routine initializes some of the log structure for a given mount point. * Its primary purpose is to fill in enough, so recovery can occur. However, * some other stuff may be filled in too. */ -STATIC xlog_t * -xlog_alloc_log(xfs_mount_t *mp, - xfs_buftarg_t *log_target, - xfs_daddr_t blk_offset, - int num_bblks) +STATIC struct xlog * +xlog_alloc_log( + struct xfs_mount *mp, + struct xfs_buftarg *log_target, + xfs_daddr_t blk_offset, + int num_bblks) { - xlog_t *log; + struct xlog *log; xlog_rec_header_t *head; xlog_in_core_t **iclogp; xlog_in_core_t *iclog, *prev_iclog=NULL; xfs_buf_t *bp; int i; - int iclogsize; + int error = ENOMEM; + uint log2_size = 0; - log = (xlog_t *)kmem_zalloc(sizeof(xlog_t), KM_SLEEP); + log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); + if (!log) { + xfs_warn(mp, "Log allocation failed: No memory!"); + goto out; + } log->l_mp = mp; log->l_targ = log_target; @@ -1143,43 +1329,64 @@ xlog_alloc_log(xfs_mount_t *mp, log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; log->l_flags |= XLOG_ACTIVE_RECOVERY; + INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; - ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, 1, 0); /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ - log->l_last_sync_lsn = log->l_tail_lsn; + xlog_assign_atomic_lsn(&log->l_tail_lsn, 1, 0); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - log->l_grant_reserve_cycle = 1; - log->l_grant_write_cycle = 1; - if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) { - log->l_sectbb_log = mp->m_sb.sb_logsectlog - BBSHIFT; - ASSERT(log->l_sectbb_log <= mp->m_sectbb_log); + xlog_grant_head_init(&log->l_reserve_head); + xlog_grant_head_init(&log->l_write_head); + + error = EFSCORRUPTED; + if (xfs_sb_version_hassector(&mp->m_sb)) { + log2_size = mp->m_sb.sb_logsectlog; + if (log2_size < BBSHIFT) { + xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", + log2_size, BBSHIFT); + goto out_free_log; + } + + log2_size -= BBSHIFT; + if (log2_size > mp->m_sectbb_log) { + xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)", + log2_size, mp->m_sectbb_log); + goto out_free_log; + } + /* for larger sector sizes, must have v2 or external log */ - ASSERT(log->l_sectbb_log == 0 || - log->l_logBBstart == 0 || - XFS_SB_VERSION_HASLOGV2(&mp->m_sb)); - ASSERT(mp->m_sb.sb_logsectlog >= BBSHIFT); + if (log2_size && log->l_logBBstart > 0 && + !xfs_sb_version_haslogv2(&mp->m_sb)) { + xfs_warn(mp, + "log sector size (0x%x) invalid for configuration.", + log2_size); + goto out_free_log; + } } - log->l_sectbb_mask = (1 << log->l_sectbb_log) - 1; + log->l_sectBBsize = 1 << log2_size; xlog_get_iclog_buffer_size(mp, log); - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - log->l_xbuf = bp; + error = ENOMEM; + bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); + if (!bp) + goto out_free_log; + + /* + * The iclogbuf buffer locks are held over IO but we are not going to do + * IO yet. Hence unlock the buffer so that the log IO path can grab it + * when appropriately. + */ + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); - spinlock_init(&log->l_icloglock, "iclog"); - spinlock_init(&log->l_grant_lock, "grhead_iclog"); - initnsema(&log->l_flushsema, 0, "ic-flush"); - xlog_state_ticket_alloc(log); /* wait until after icloglock inited */ + bp->b_iodone = xlog_iodone; + log->l_xbuf = bp; - /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ - ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); + spin_lock_init(&log->l_icloglock); + init_waitqueue_head(&log->l_flush_wait); iclogp = &log->l_iclog; /* @@ -1189,52 +1396,74 @@ xlog_alloc_log(xfs_mount_t *mp, * with different amounts of memory. See the definition of * xlog_in_core_t in xfs_log_priv.h for details. */ - iclogsize = log->l_iclog_size; ASSERT(log->l_iclog_size >= 4096); for (i=0; i < log->l_iclog_bufs; i++) { - *iclogp = (xlog_in_core_t *) - kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP); - iclog = *iclogp; - iclog->hic_data = (xlog_in_core_2_t *) - kmem_zalloc(iclogsize, KM_SLEEP); + *iclogp = kmem_zalloc(sizeof(xlog_in_core_t), KM_MAYFAIL); + if (!*iclogp) + goto out_free_iclog; + iclog = *iclogp; iclog->ic_prev = prev_iclog; prev_iclog = iclog; - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + bp = xfs_buf_get_uncached(mp->m_logdev_targp, + BTOBB(log->l_iclog_size), 0); + if (!bp) + goto out_free_iclog; + + ASSERT(xfs_buf_islocked(bp)); + xfs_buf_unlock(bp); + + bp->b_iodone = xlog_iodone; + iclog->ic_bp = bp; + iclog->ic_data = bp->b_addr; +#ifdef DEBUG + log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); +#endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); - INT_SET(head->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM); - INT_SET(head->h_version, ARCH_CONVERT, - XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1); - INT_SET(head->h_size, ARCH_CONVERT, log->l_iclog_size); + head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + head->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + head->h_size = cpu_to_be32(log->l_iclog_size); /* new fields */ - INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT); + head->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); - bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone); - XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); - iclog->ic_bp = bp; - - iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; + iclog->ic_size = BBTOB(bp->b_length) - log->l_iclog_hsize; iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_log = log; + atomic_set(&iclog->ic_refcnt, 0); + spin_lock_init(&iclog->ic_callback_lock); iclog->ic_callback_tail = &(iclog->ic_callback); - iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; + iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize; - ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp)); - ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0); - sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force"); - sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write"); + init_waitqueue_head(&iclog->ic_force_wait); + init_waitqueue_head(&iclog->ic_write_wait); iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ + error = xlog_cil_init(log); + if (error) + goto out_free_iclog; return log; + +out_free_iclog: + for (iclog = log->l_iclog; iclog; iclog = prev_iclog) { + prev_iclog = iclog->ic_next; + if (iclog->ic_bp) + xfs_buf_free(iclog->ic_bp); + kmem_free(iclog); + } + spinlock_destroy(&log->l_icloglock); + xfs_buf_free(log->l_xbuf); +out_free_log: + kmem_free(log); +out: + return ERR_PTR(-error); } /* xlog_alloc_log */ @@ -1243,26 +1472,31 @@ xlog_alloc_log(xfs_mount_t *mp, * ticket. Return the lsn of the commit record. */ STATIC int -xlog_commit_record(xfs_mount_t *mp, - xlog_ticket_t *ticket, - xlog_in_core_t **iclog, - xfs_lsn_t *commitlsnp) +xlog_commit_record( + struct xlog *log, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, + xfs_lsn_t *commitlsnp) { - int error; - xfs_log_iovec_t reg[1]; - - reg[0].i_addr = NULL; - reg[0].i_len = 0; - XLOG_VEC_SET_TYPE(®[0], XLOG_REG_TYPE_COMMIT); + struct xfs_mount *mp = log->l_mp; + int error; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = ®, + }; ASSERT_ALWAYS(iclog); - if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp, - iclog, XLOG_COMMIT_TRANS))) { + error = xlog_write(log, &vec, ticket, commitlsnp, iclog, + XLOG_COMMIT_TRANS); + if (error) xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - } return error; -} /* xlog_commit_record */ - +} /* * Push on the buffer cache code if we ever use more than 75% of the on-disk @@ -1271,65 +1505,177 @@ xlog_commit_record(xfs_mount_t *mp, * pushes on an lsn which is further along in the log once we reach the high * water mark. In this manner, we would be creating a low water mark. */ -void -xlog_grant_push_ail(xfs_mount_t *mp, - int need_bytes) -{ - xlog_t *log = mp->m_log; /* pointer to the log */ - xfs_lsn_t tail_lsn; /* lsn of the log tail */ - xfs_lsn_t threshold_lsn = 0; /* lsn we'd like to be at */ - int free_blocks; /* free blocks left to write to */ - int free_bytes; /* free bytes left to write to */ - int threshold_block; /* block in lsn we'd like to be at */ - int threshold_cycle; /* lsn cycle we'd like to be at */ - int free_threshold; - SPLDECL(s); - - ASSERT(BTOBB(need_bytes) < log->l_logBBsize); - - s = GRANT_LOCK(log); - free_bytes = xlog_space_left(log, - log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); - tail_lsn = log->l_tail_lsn; - free_blocks = BTOBBT(free_bytes); - - /* - * Set the threshold for the minimum number of free blocks in the - * log to the maximum of what the caller needs, one quarter of the - * log, and 256 blocks. - */ - free_threshold = BTOBB(need_bytes); - free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); - free_threshold = MAX(free_threshold, 256); - if (free_blocks < free_threshold) { - threshold_block = BLOCK_LSN(tail_lsn) + free_threshold; - threshold_cycle = CYCLE_LSN(tail_lsn); +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn = 0; + xfs_lsn_t last_sync_lsn; + int free_blocks; + int free_bytes; + int threshold_block; + int threshold_cycle; + int free_threshold; + + ASSERT(BTOBB(need_bytes) < log->l_logBBsize); + + free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); + free_blocks = BTOBBT(free_bytes); + + /* + * Set the threshold for the minimum number of free blocks in the + * log to the maximum of what the caller needs, one quarter of the + * log, and 256 blocks. + */ + free_threshold = BTOBB(need_bytes); + free_threshold = MAX(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = MAX(free_threshold, 256); + if (free_blocks >= free_threshold) + return; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); + threshold_block += free_threshold; if (threshold_block >= log->l_logBBsize) { - threshold_block -= log->l_logBBsize; - threshold_cycle += 1; + threshold_block -= log->l_logBBsize; + threshold_cycle += 1; } - ASSIGN_ANY_LSN_HOST(threshold_lsn, threshold_cycle, - threshold_block); + threshold_lsn = xlog_assign_lsn(threshold_cycle, + threshold_block); + /* + * Don't pass in an lsn greater than the lsn of the last + * log record known to be on disk. Use a snapshot of the last sync lsn + * so that it doesn't change between the compare and the set. + */ + last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; - /* Don't pass in an lsn greater than the lsn of the last - * log record known to be on disk. + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. */ - if (XFS_LSN_CMP(threshold_lsn, log->l_last_sync_lsn) > 0) - threshold_lsn = log->l_last_sync_lsn; - } - GRANT_UNLOCK(log, s); + if (!XLOG_FORCED_SHUTDOWN(log)) + xfs_ail_push(log->l_ailp, threshold_lsn); +} - /* - * Get the transaction layer to kick the dirty buffers out to - * disk asynchronously. No point in trying to do this if - * the filesystem is shutting down. - */ - if (threshold_lsn && - !XLOG_FORCED_SHUTDOWN(log)) - xfs_trans_push_ail(mp, threshold_lsn); -} /* xlog_grant_push_ail */ +/* + * Stamp cycle number in every block + */ +STATIC void +xlog_pack_data( + struct xlog *log, + struct xlog_in_core *iclog, + int roundoff) +{ + int i, j, k; + int size = iclog->ic_offset + roundoff; + __be32 cycle_lsn; + xfs_caddr_t dp; + + cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); + + dp = iclog->ic_datap; + for (i = 0; i < BTOBB(size); i++) { + if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) + break; + iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = iclog->ic_data; + + for ( ; i < BTOBB(size); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; + *(__be32 *)dp = cycle_lsn; + dp += BBSIZE; + } + + for (i = 1; i < log->l_iclog_heads; i++) + xhdr[i].hic_xheader.xh_cycle = cycle_lsn; + } +} + +/* + * Calculate the checksum for a log buffer. + * + * This is a little more complicated than it should be because the various + * headers and the actual data are non-contiguous. + */ +__le32 +xlog_cksum( + struct xlog *log, + struct xlog_rec_header *rhead, + char *dp, + int size) +{ + __uint32_t crc; + + /* first generate the crc for the record header ... */ + crc = xfs_start_cksum((char *)rhead, + sizeof(struct xlog_rec_header), + offsetof(struct xlog_rec_header, h_crc)); + + /* ... then for additional cycle data for v2 logs ... */ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; + int i; + + for (i = 1; i < log->l_iclog_heads; i++) { + crc = crc32c(crc, &xhdr[i].hic_xheader, + sizeof(struct xlog_rec_ext_header)); + } + } + + /* ... and finally for the payload */ + crc = crc32c(crc, dp, size); + + return xfs_end_cksum(crc); +} + +/* + * The bdstrat callback function for log bufs. This gives us a central + * place to trap bufs in case we get hit by a log I/O error and need to + * shutdown. Actually, in practice, even when we didn't get a log error, + * we transition the iclogs to IOERROR state *after* flushing all existing + * iclogs to disk. This is because we don't want anymore new transactions to be + * started or completed afterwards. + * + * We lock the iclogbufs here so that we can serialise against IO completion + * during unmount. We might be processing a shutdown triggered during unmount, + * and that can occur asynchronously to the unmount thread, and hence we need to + * ensure that completes before tearing down the iclogbufs. Hence we need to + * hold the buffer lock across the log IO to acheive that. + */ +STATIC int +xlog_bdstrat( + struct xfs_buf *bp) +{ + struct xlog_in_core *iclog = bp->b_fspriv; + + xfs_buf_lock(bp); + if (iclog->ic_state & XLOG_STATE_IOERROR) { + xfs_buf_ioerror(bp, EIO); + xfs_buf_stale(bp); + xfs_buf_ioend(bp, 0); + /* + * It would seem logical to return EIO here, but we rely on + * the log state machine to propagate I/O errors instead of + * doing it here. Similarly, IO completion will unlock the + * buffer, so we don't do it here. + */ + return 0; + } + + xfs_buf_iorequest(bp); + return 0; +} /* * Flush out the in-core log (iclog) to the on-disk log in an asynchronous @@ -1356,23 +1702,23 @@ xlog_grant_push_ail(xfs_mount_t *mp, * is added immediately before calling bwrite(). */ -int -xlog_sync(xlog_t *log, - xlog_in_core_t *iclog) +STATIC int +xlog_sync( + struct xlog *log, + struct xlog_in_core *iclog) { - xfs_caddr_t dptr; /* pointer to byte sized element */ xfs_buf_t *bp; - int i, ops; + int i; uint count; /* byte count of bwrite */ uint count_init; /* initial count before roundup */ int roundoff; /* roundoff to BB or stripe */ int split = 0; /* split write into two regions */ int error; - SPLDECL(s); - int v2 = XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb); + int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); + int size; XFS_STATS_INC(xs_log_writes); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ count_init = log->l_iclog_hsize + iclog->ic_offset; @@ -1393,57 +1739,83 @@ xlog_sync(xlog_t *log, roundoff < BBTOB(1))); /* move grant heads by roundoff in sync */ - s = GRANT_LOCK(log); - xlog_grant_add_space(log, roundoff); - GRANT_UNLOCK(log, s); + xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff); + xlog_grant_add_space(log, &log->l_write_head.grant, roundoff); /* put cycle number in every block */ xlog_pack_data(log, iclog, roundoff); /* real byte length */ - if (v2) { - INT_SET(iclog->ic_header.h_len, - ARCH_CONVERT, - iclog->ic_offset + roundoff); - } else { - INT_SET(iclog->ic_header.h_len, ARCH_CONVERT, iclog->ic_offset); - } - - /* put ops count in correct order */ - ops = iclog->ic_header.h_num_logops; - INT_SET(iclog->ic_header.h_num_logops, ARCH_CONVERT, ops); + size = iclog->ic_offset; + if (v2) + size += roundoff; + iclog->ic_header.h_len = cpu_to_be32(size); bp = iclog->ic_bp; - ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long)1); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); - XFS_BUF_SET_ADDR(bp, BLOCK_LSN(INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT))); + XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { + char *dptr; + split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); - iclog->ic_bwritecnt = 2; /* split into 2 writes */ + iclog->ic_bwritecnt = 2; + + /* + * Bump the cycle numbers at the start of each block in the + * part of the iclog that ends up in the buffer that gets + * written to the start of the log. + * + * Watch out for the header magic number case, though. + */ + dptr = (char *)&iclog->ic_header + count; + for (i = 0; i < split; i += BBSIZE) { + __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); + if (++cycle == XLOG_HEADER_MAGIC_NUM) + cycle++; + *(__be32 *)dptr = cpu_to_be32(cycle); + + dptr += BBSIZE; + } } else { iclog->ic_bwritecnt = 1; } - XFS_BUF_SET_PTR(bp, (xfs_caddr_t) &(iclog->ic_header), count); - XFS_BUF_SET_FSPRIVATE(bp, iclog); /* save for later */ + + /* calculcate the checksum */ + iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, + iclog->ic_datap, size); + + bp->b_io_length = BTOBB(count); + bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); - /* - * Do an ordered write for the log block. - * Its unnecessary to flush the first split block in the log wrap case. - */ - if (!split && (log->l_mp->m_flags & XFS_MOUNT_BARRIER)) - XFS_BUF_ORDERED(bp); + bp->b_flags |= XBF_SYNCIO; + + if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { + bp->b_flags |= XBF_FUA; + + /* + * Flush the data device before flushing the log to make + * sure all meta data written back from the AIL actually made + * it to disk before stamping the new log tail LSN into the + * log buffer. For an external log we need to issue the + * flush explicitly, and unfortunately synchronously here; + * for an internal log we can simply use the block layer + * state machine for preflushes. + */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; + } ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - xlog_verify_iclog(log, iclog, count, B_TRUE); + xlog_verify_iclog(log, iclog, count, true); /* account for log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); @@ -1453,38 +1825,22 @@ xlog_sync(xlog_t *log, */ XFS_BUF_WRITE(bp); - if ((error = XFS_bwrite(bp))) { - xfs_ioerror_alert("xlog_sync", log->l_mp, bp, - XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync"); return error; } if (split) { bp = iclog->ic_log->l_xbuf; - ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == - (unsigned long)1); - XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)2); XFS_BUF_SET_ADDR(bp, 0); /* logical 0 */ - XFS_BUF_SET_PTR(bp, (xfs_caddr_t)((__psint_t)&(iclog->ic_header)+ - (__psint_t)count), split); - XFS_BUF_SET_FSPRIVATE(bp, iclog); + xfs_buf_associate_memory(bp, + (char *)&iclog->ic_header + count, split); + bp->b_fspriv = iclog; XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); XFS_BUF_ASYNC(bp); + bp->b_flags |= XBF_SYNCIO; if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - XFS_BUF_ORDERED(bp); - dptr = XFS_BUF_PTR(bp); - /* - * Bump the cycle numbers at the start of each block - * since this part of the buffer is at the start of - * a new cycle. Watch out for the header magic number - * case, though. - */ - for (i=0; i<split; i += BBSIZE) { - INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1); - if (INT_GET(*(uint *)dptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) - INT_MOD(*(uint *)dptr, ARCH_CONVERT, +1); - dptr += BBSIZE; - } + bp->b_flags |= XBF_FUA; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1492,73 +1848,59 @@ xlog_sync(xlog_t *log, /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); - if ((error = XFS_bwrite(bp))) { - xfs_ioerror_alert("xlog_sync (split)", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + error = xlog_bdstrat(bp); + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_sync (split)"); return error; } } return 0; } /* xlog_sync */ - /* * Deallocate a log structure */ -void -xlog_dealloc_log(xlog_t *log) +STATIC void +xlog_dealloc_log( + struct xlog *log) { xlog_in_core_t *iclog, *next_iclog; - xlog_ticket_t *tic, *next_tic; int i; + xlog_cil_destroy(log); + /* + * Cycle all the iclogbuf locks to make sure all log IO completion + * is done before we tear down these buffers. + */ iclog = log->l_iclog; - for (i=0; i<log->l_iclog_bufs; i++) { - sv_destroy(&iclog->ic_forcesema); - sv_destroy(&iclog->ic_writesema); + for (i = 0; i < log->l_iclog_bufs; i++) { + xfs_buf_lock(iclog->ic_bp); + xfs_buf_unlock(iclog->ic_bp); + iclog = iclog->ic_next; + } + + /* + * Always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. Also, cycle the lock + * first to ensure we've completed IO on it. + */ + xfs_buf_lock(log->l_xbuf); + xfs_buf_unlock(log->l_xbuf); + xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size)); + xfs_buf_free(log->l_xbuf); + + iclog = log->l_iclog; + for (i = 0; i < log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif next_iclog = iclog->ic_next; - kmem_free(iclog->hic_data, log->l_iclog_size); - kmem_free(iclog, sizeof(xlog_in_core_t)); + kmem_free(iclog); iclog = next_iclog; } - freesema(&log->l_flushsema); spinlock_destroy(&log->l_icloglock); - spinlock_destroy(&log->l_grant_lock); - - /* XXXsup take a look at this again. */ - if ((log->l_ticket_cnt != log->l_ticket_tcnt) && - !XLOG_FORCED_SHUTDOWN(log)) { - xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_dealloc_log: (cnt: %d, total: %d)", - log->l_ticket_cnt, log->l_ticket_tcnt); - /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */ - } else { - tic = log->l_unmount_free; - while (tic) { - next_tic = tic->t_next; - kmem_free(tic, NBPP); - tic = next_tic; - } - } - xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif log->l_mp->m_log = NULL; - kmem_free(log, sizeof(xlog_t)); + kmem_free(log); } /* xlog_dealloc_log */ /* @@ -1566,19 +1908,18 @@ xlog_dealloc_log(xlog_t *log) */ /* ARGSUSED */ static inline void -xlog_state_finish_copy(xlog_t *log, - xlog_in_core_t *iclog, - int record_cnt, - int copy_bytes) +xlog_state_finish_copy( + struct xlog *log, + struct xlog_in_core *iclog, + int record_cnt, + int copy_bytes) { - SPLDECL(s); + spin_lock(&log->l_icloglock); - s = LOG_LOCK(log); - - iclog->ic_header.h_num_logops += record_cnt; + be32_add_cpu(&iclog->ic_header.h_num_logops, record_cnt); iclog->ic_offset += copy_bytes; - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } /* xlog_state_finish_copy */ @@ -1588,8 +1929,10 @@ xlog_state_finish_copy(xlog_t *log, * print out info relating to regions written which consume * the reservation */ -STATIC void -xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) +void +xlog_print_tic_res( + struct xfs_mount *mp, + struct xlog_ticket *ticket) { uint i; uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); @@ -1659,36 +2002,228 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) "SWAPEXT" }; - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_log_write: reservation summary:\n" - " trans type = %s (%u)\n" - " unit res = %d bytes\n" - " current res = %d bytes\n" - " total reg = %u bytes (o/flow = %u bytes)\n" - " ophdrs = %u (ophdr space = %u bytes)\n" - " ophdr + reg = %u bytes\n" - " num regions = %u\n", - ((ticket->t_trans_type <= 0 || - ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? - "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), - ticket->t_trans_type, - ticket->t_unit_res, - ticket->t_curr_res, - ticket->t_res_arr_sum, ticket->t_res_o_flow, - ticket->t_res_num_ophdrs, ophdr_spc, - ticket->t_res_arr_sum + - ticket->t_res_o_flow + ophdr_spc, - ticket->t_res_num); + xfs_warn(mp, + "xlog_write: reservation summary:\n" + " trans type = %s (%u)\n" + " unit res = %d bytes\n" + " current res = %d bytes\n" + " total reg = %u bytes (o/flow = %u bytes)\n" + " ophdrs = %u (ophdr space = %u bytes)\n" + " ophdr + reg = %u bytes\n" + " num regions = %u\n", + ((ticket->t_trans_type <= 0 || + ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? + "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), + ticket->t_trans_type, + ticket->t_unit_res, + ticket->t_curr_res, + ticket->t_res_arr_sum, ticket->t_res_o_flow, + ticket->t_res_num_ophdrs, ophdr_spc, + ticket->t_res_arr_sum + + ticket->t_res_o_flow + ophdr_spc, + ticket->t_res_num); for (i = 0; i < ticket->t_res_num; i++) { - uint r_type = ticket->t_res_arr[i].r_type; - cmn_err(CE_WARN, - "region[%u]: %s - %u bytes\n", - i, + uint r_type = ticket->t_res_arr[i].r_type; + xfs_warn(mp, "region[%u]: %s - %u bytes", i, ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? "bad-rtype" : res_type_str[r_type-1]), ticket->t_res_arr[i].r_len); } + + xfs_alert_tag(mp, XFS_PTAG_LOGRES, + "xlog_write: reservation ran out. Need to up reservation"); + xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); +} + +/* + * Calculate the potential space needed by the log vector. Each region gets + * its own xlog_op_header_t and may need to be double word aligned. + */ +static int +xlog_write_calc_vec_length( + struct xlog_ticket *ticket, + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + int headers = 0; + int len = 0; + int i; + + /* acct for start rec of xact */ + if (ticket->t_flags & XLOG_TIC_INITED) + headers++; + + for (lv = log_vector; lv; lv = lv->lv_next) { + /* we don't write ordered log vectors */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) + continue; + + headers += lv->lv_niovecs; + + for (i = 0; i < lv->lv_niovecs; i++) { + struct xfs_log_iovec *vecp = &lv->lv_iovecp[i]; + + len += vecp->i_len; + xlog_tic_add_region(ticket, vecp->i_len, vecp->i_type); + } + } + + ticket->t_res_num_ophdrs += headers; + len += headers * sizeof(struct xlog_op_header); + + return len; +} + +/* + * If first write for transaction, insert start record We can't be trying to + * commit if we are inited. We can't have any "partial_copy" if we are inited. + */ +static int +xlog_write_start_rec( + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket) +{ + if (!(ticket->t_flags & XLOG_TIC_INITED)) + return 0; + + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_len = 0; + ophdr->oh_flags = XLOG_START_TRANS; + ophdr->oh_res2 = 0; + + ticket->t_flags &= ~XLOG_TIC_INITED; + + return sizeof(struct xlog_op_header); +} + +static xlog_op_header_t * +xlog_write_setup_ophdr( + struct xlog *log, + struct xlog_op_header *ophdr, + struct xlog_ticket *ticket, + uint flags) +{ + ophdr->oh_tid = cpu_to_be32(ticket->t_tid); + ophdr->oh_clientid = ticket->t_clientid; + ophdr->oh_res2 = 0; + + /* are we copying a commit or unmount record? */ + ophdr->oh_flags = flags; + + /* + * We've seen logs corrupted with bad transaction client ids. This + * makes sure that XFS doesn't generate them on. Turn this into an EIO + * and shut down the filesystem. + */ + switch (ophdr->oh_clientid) { + case XFS_TRANSACTION: + case XFS_VOLUME: + case XFS_LOG: + break; + default: + xfs_warn(log->l_mp, + "Bad XFS transaction clientid 0x%x in ticket 0x%p", + ophdr->oh_clientid, ticket); + return NULL; + } + + return ophdr; +} + +/* + * Set up the parameters of the region copy into the log. This has + * to handle region write split across multiple log buffers - this + * state is kept external to this function so that this code can + * be written in an obvious, self documenting manner. + */ +static int +xlog_write_setup_copy( + struct xlog_ticket *ticket, + struct xlog_op_header *ophdr, + int space_available, + int space_required, + int *copy_off, + int *copy_len, + int *last_was_partial_copy, + int *bytes_consumed) +{ + int still_to_copy; + + still_to_copy = space_required - *bytes_consumed; + *copy_off = *bytes_consumed; + + if (still_to_copy <= space_available) { + /* write of region completes here */ + *copy_len = still_to_copy; + ophdr->oh_len = cpu_to_be32(*copy_len); + if (*last_was_partial_copy) + ophdr->oh_flags |= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); + *last_was_partial_copy = 0; + *bytes_consumed = 0; + return 0; + } + + /* partial write of region, needs extra log op header reservation */ + *copy_len = space_available; + ophdr->oh_len = cpu_to_be32(*copy_len); + ophdr->oh_flags |= XLOG_CONTINUE_TRANS; + if (*last_was_partial_copy) + ophdr->oh_flags |= XLOG_WAS_CONT_TRANS; + *bytes_consumed += *copy_len; + (*last_was_partial_copy)++; + + /* account for new log op header */ + ticket->t_curr_res -= sizeof(struct xlog_op_header); + ticket->t_res_num_ophdrs++; + + return sizeof(struct xlog_op_header); +} + +static int +xlog_write_copy_finish( + struct xlog *log, + struct xlog_in_core *iclog, + uint flags, + int *record_cnt, + int *data_cnt, + int *partial_copy, + int *partial_copy_len, + int log_offset, + struct xlog_in_core **commit_iclog) +{ + if (*partial_copy) { + /* + * This iclog has already been marked WANT_SYNC by + * xlog_state_get_iclog_space. + */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + return xlog_state_release_iclog(log, iclog); + } + + *partial_copy = 0; + *partial_copy_len = 0; + + if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { + /* no more space in this iclog - push it. */ + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + + spin_lock(&log->l_icloglock); + xlog_state_want_sync(log, iclog); + spin_unlock(&log->l_icloglock); + + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); + ASSERT(flags & XLOG_COMMIT_TRANS); + *commit_iclog = iclog; + } + + return 0; } /* @@ -1732,207 +2267,170 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) * bytes have been written out. */ int -xlog_write(xfs_mount_t * mp, - xfs_log_iovec_t reg[], - int nentries, - xfs_log_ticket_t tic, - xfs_lsn_t *start_lsn, - xlog_in_core_t **commit_iclog, - uint flags) -{ - xlog_t *log = mp->m_log; - xlog_ticket_t *ticket = (xlog_ticket_t *)tic; - xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ - xlog_op_header_t *logop_head; /* ptr to log operation header */ - __psint_t ptr; /* copy address into data region */ - int len; /* # xlog_write() bytes 2 still copy */ - int index; /* region index currently copying */ - int log_offset; /* offset (from 0) into data region */ - int start_rec_copy; /* # bytes to copy for start record */ - int partial_copy; /* did we split a region? */ - int partial_copy_len;/* # bytes copied if split region */ - int need_copy; /* # bytes need to memcpy this region */ - int copy_len; /* # bytes actually memcpy'ing */ - int copy_off; /* # bytes from entry start */ - int contwr; /* continued write of in-core log? */ - int error; - int record_cnt = 0, data_cnt = 0; - - partial_copy_len = partial_copy = 0; - - /* Calculate potential maximum space. Each region gets its own - * xlog_op_header_t and may need to be double word aligned. - */ - len = 0; - if (ticket->t_flags & XLOG_TIC_INITED) { /* acct for start rec of xact */ - len += sizeof(xlog_op_header_t); - XLOG_TIC_ADD_OPHDR(ticket); - } +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags) +{ + struct xlog_in_core *iclog = NULL; + struct xfs_log_iovec *vecp; + struct xfs_log_vec *lv; + int len; + int index; + int partial_copy = 0; + int partial_copy_len = 0; + int contwr = 0; + int record_cnt = 0; + int data_cnt = 0; + int error; + + *start_lsn = 0; + + len = xlog_write_calc_vec_length(ticket, log_vector); - for (index = 0; index < nentries; index++) { - len += sizeof(xlog_op_header_t); /* each region gets >= 1 */ - XLOG_TIC_ADD_OPHDR(ticket); - len += reg[index].i_len; - XLOG_TIC_ADD_REGION(ticket, reg[index].i_len, reg[index].i_type); - } - contwr = *start_lsn = 0; + /* + * Region headers and bytes are already accounted for. + * We only need to take into account start records and + * split regions in this function. + */ + if (ticket->t_flags & XLOG_TIC_INITED) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - if (ticket->t_curr_res < len) { - xlog_print_tic_res(mp, ticket); -#ifdef DEBUG - xlog_panic( - "xfs_log_write: reservation ran out. Need to up reservation"); -#else - /* Customer configurable panic */ - xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, - "xfs_log_write: reservation ran out. Need to up reservation"); - /* If we did not panic, shutdown the filesystem */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); -#endif - } else - ticket->t_curr_res -= len; + /* + * Commit record headers need to be accounted for. These + * come in as separate writes so are easy to detect. + */ + if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS)) + ticket->t_curr_res -= sizeof(xlog_op_header_t); - for (index = 0; index < nentries; ) { - if ((error = xlog_state_get_iclog_space(log, len, &iclog, ticket, - &contwr, &log_offset))) - return error; + if (ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, ticket); - ASSERT(log_offset <= iclog->ic_size - 1); - ptr = (__psint_t) ((char *)iclog->ic_datap+log_offset); + index = 0; + lv = log_vector; + vecp = lv->lv_iovecp; + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + void *ptr; + int log_offset; - /* start_lsn is the first lsn written to. That's all we need. */ - if (! *start_lsn) - *start_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + error = xlog_state_get_iclog_space(log, len, &iclog, ticket, + &contwr, &log_offset); + if (error) + return error; - /* This loop writes out as many regions as can fit in the amount - * of space which was allocated by xlog_state_get_iclog_space(). - */ - while (index < nentries) { - ASSERT(reg[index].i_len % sizeof(__int32_t) == 0); - ASSERT((__psint_t)ptr % sizeof(__int32_t) == 0); - start_rec_copy = 0; - - /* If first write for transaction, insert start record. - * We can't be trying to commit if we are inited. We can't - * have any "partial_copy" if we are inited. - */ - if (ticket->t_flags & XLOG_TIC_INITED) { - logop_head = (xlog_op_header_t *)ptr; - INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_len = 0; - logop_head->oh_flags = XLOG_START_TRANS; - logop_head->oh_res2 = 0; - ticket->t_flags &= ~XLOG_TIC_INITED; /* clear bit */ - record_cnt++; - - start_rec_copy = sizeof(xlog_op_header_t); - xlog_write_adv_cnt(ptr, len, log_offset, start_rec_copy); - } + ASSERT(log_offset <= iclog->ic_size - 1); + ptr = iclog->ic_datap + log_offset; - /* Copy log operation header directly into data section */ - logop_head = (xlog_op_header_t *)ptr; - INT_SET(logop_head->oh_tid, ARCH_CONVERT, ticket->t_tid); - logop_head->oh_clientid = ticket->t_clientid; - logop_head->oh_res2 = 0; + /* start_lsn is the first lsn written to. That's all we need. */ + if (!*start_lsn) + *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); - /* header copied directly */ - xlog_write_adv_cnt(ptr, len, log_offset, sizeof(xlog_op_header_t)); + /* + * This loop writes out as many regions as can fit in the amount + * of space which was allocated by xlog_state_get_iclog_space(). + */ + while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { + struct xfs_log_iovec *reg; + struct xlog_op_header *ophdr; + int start_rec_copy; + int copy_len; + int copy_off; + bool ordered = false; + + /* ordered log vectors have no regions to write */ + if (lv->lv_buf_len == XFS_LOG_VEC_ORDERED) { + ASSERT(lv->lv_niovecs == 0); + ordered = true; + goto next_lv; + } - /* are we copying a commit or unmount record? */ - logop_head->oh_flags = flags; + reg = &vecp[index]; + ASSERT(reg->i_len % sizeof(__int32_t) == 0); + ASSERT((unsigned long)ptr % sizeof(__int32_t) == 0); - /* - * We've seen logs corrupted with bad transaction client - * ids. This makes sure that XFS doesn't generate them on. - * Turn this into an EIO and shut down the filesystem. - */ - switch (logop_head->oh_clientid) { - case XFS_TRANSACTION: - case XFS_VOLUME: - case XFS_LOG: - break; - default: - xfs_fs_cmn_err(CE_WARN, mp, - "Bad XFS transaction clientid 0x%x in ticket 0x%p", - logop_head->oh_clientid, tic); - return XFS_ERROR(EIO); - } + start_rec_copy = xlog_write_start_rec(ptr, ticket); + if (start_rec_copy) { + record_cnt++; + xlog_write_adv_cnt(&ptr, &len, &log_offset, + start_rec_copy); + } - /* Partial write last time? => (partial_copy != 0) - * need_copy is the amount we'd like to copy if everything could - * fit in the current memcpy. - */ - need_copy = reg[index].i_len - partial_copy_len; - - copy_off = partial_copy_len; - if (need_copy <= iclog->ic_size - log_offset) { /*complete write */ - INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len = need_copy); - if (partial_copy) - logop_head->oh_flags|= (XLOG_END_TRANS|XLOG_WAS_CONT_TRANS); - partial_copy_len = partial_copy = 0; - } else { /* partial write */ - copy_len = iclog->ic_size - log_offset; - INT_SET(logop_head->oh_len, ARCH_CONVERT, copy_len); - logop_head->oh_flags |= XLOG_CONTINUE_TRANS; - if (partial_copy) - logop_head->oh_flags |= XLOG_WAS_CONT_TRANS; - partial_copy_len += copy_len; - partial_copy++; - len += sizeof(xlog_op_header_t); /* from splitting of region */ - /* account for new log op header */ - ticket->t_curr_res -= sizeof(xlog_op_header_t); - XLOG_TIC_ADD_OPHDR(ticket); - } - xlog_verify_dest_ptr(log, ptr); - - /* copy region */ - ASSERT(copy_len >= 0); - memcpy((xfs_caddr_t)ptr, reg[index].i_addr + copy_off, copy_len); - xlog_write_adv_cnt(ptr, len, log_offset, copy_len); - - /* make copy_len total bytes copied, including headers */ - copy_len += start_rec_copy + sizeof(xlog_op_header_t); - record_cnt++; - data_cnt += contwr ? copy_len : 0; - if (partial_copy) { /* copied partial region */ - /* already marked WANT_SYNC by xlog_state_get_iclog_space */ - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - if ((error = xlog_state_release_iclog(log, iclog))) - return error; - break; /* don't increment index */ - } else { /* copied entire region */ - index++; - partial_copy_len = partial_copy = 0; - - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - record_cnt = data_cnt = 0; - xlog_state_want_sync(log, iclog); - if (commit_iclog) { - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else if ((error = xlog_state_release_iclog(log, iclog))) - return error; - if (index == nentries) - return 0; /* we are done */ - else - break; + ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); + if (!ophdr) + return XFS_ERROR(EIO); + + xlog_write_adv_cnt(&ptr, &len, &log_offset, + sizeof(struct xlog_op_header)); + + len += xlog_write_setup_copy(ticket, ophdr, + iclog->ic_size-log_offset, + reg->i_len, + ©_off, ©_len, + &partial_copy, + &partial_copy_len); + xlog_verify_dest_ptr(log, ptr); + + /* copy region */ + ASSERT(copy_len >= 0); + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); + + copy_len += start_rec_copy + sizeof(xlog_op_header_t); + record_cnt++; + data_cnt += contwr ? copy_len : 0; + + error = xlog_write_copy_finish(log, iclog, flags, + &record_cnt, &data_cnt, + &partial_copy, + &partial_copy_len, + log_offset, + commit_iclog); + if (error) + return error; + + /* + * if we had a partial copy, we need to get more iclog + * space but we don't want to increment the region + * index because there is still more is this region to + * write. + * + * If we completed writing this region, and we flushed + * the iclog (indicated by resetting of the record + * count), then we also need to get more log space. If + * this was the last record, though, we are done and + * can just return. + */ + if (partial_copy) + break; + + if (++index == lv->lv_niovecs) { +next_lv: + lv = lv->lv_next; + index = 0; + if (lv) + vecp = lv->lv_iovecp; + } + if (record_cnt == 0 && ordered == false) { + if (!lv) + return 0; + break; + } } - } /* if (partial_copy) */ - } /* while (index < nentries) */ - } /* for (index = 0; index < nentries; ) */ - ASSERT(len == 0); + } + + ASSERT(len == 0); + + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); + if (!commit_iclog) + return xlog_state_release_iclog(log, iclog); - xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { ASSERT(flags & XLOG_COMMIT_TRANS); *commit_iclog = iclog; return 0; - } - return xlog_state_release_iclog(log, iclog); -} /* xlog_write */ +} /***************************************************************************** @@ -1945,13 +2443,14 @@ xlog_write(xfs_mount_t * mp, /* Clean iclogs starting from the head. This ordering must be * maintained, so an iclog doesn't become ACTIVE beyond one that * is SYNCING. This is also required to maintain the notion that we use - * a counting semaphore to hold off would be writers to the log when every + * a ordered wait queue to hold off would be writers to the log when every * iclog is trying to sync to disk. * * State Change: DIRTY -> ACTIVE */ STATIC void -xlog_state_clean_log(xlog_t *log) +xlog_state_clean_log( + struct xlog *log) { xlog_in_core_t *iclog; int changed = 0; @@ -1961,7 +2460,7 @@ xlog_state_clean_log(xlog_t *log) if (iclog->ic_state == XLOG_STATE_DIRTY) { iclog->ic_state = XLOG_STATE_ACTIVE; iclog->ic_offset = 0; - iclog->ic_callback = NULL; /* don't need to free */ + ASSERT(iclog->ic_callback == NULL); /* * If the number of ops in this iclog indicate it just * contains the dummy transaction, we can @@ -1971,7 +2470,8 @@ xlog_state_clean_log(xlog_t *log) * We don't need to cover the dummy. */ if (!changed && - (INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT) == XLOG_COVER_OPS)) { + (be32_to_cpu(iclog->ic_header.h_num_logops) == + XLOG_COVER_OPS)) { changed = 1; } else { /* @@ -2030,7 +2530,7 @@ xlog_state_clean_log(xlog_t *log) STATIC xfs_lsn_t xlog_get_lowest_lsn( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *lsn_log; xfs_lsn_t lowest_lsn, lsn; @@ -2039,7 +2539,7 @@ xlog_get_lowest_lsn( lowest_lsn = 0; do { if (!(lsn_log->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_DIRTY))) { - lsn = INT_GET(lsn_log->ic_header.h_lsn, ARCH_CONVERT); + lsn = be64_to_cpu(lsn_log->ic_header.h_lsn); if ((lsn && !lowest_lsn) || (XFS_LSN_CMP(lsn, lowest_lsn) < 0)) { lowest_lsn = lsn; @@ -2053,9 +2553,9 @@ xlog_get_lowest_lsn( STATIC void xlog_state_do_callback( - xlog_t *log, - int aborted, - xlog_in_core_t *ciclog) + struct xlog *log, + int aborted, + struct xlog_in_core *ciclog) { xlog_in_core_t *iclog; xlog_in_core_t *first_iclog; /* used to know when we've @@ -2068,9 +2568,9 @@ xlog_state_do_callback( int funcdidcallbacks; /* flag: function did callbacks */ int repeats; /* for issuing console warnings if * looping too many times */ - SPLDECL(s); + int wake = 0; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); first_iclog = iclog = log->l_iclog; ioerrors = 0; funcdidcallbacks = 0; @@ -2115,7 +2615,7 @@ xlog_state_do_callback( * to DO_CALLBACK, we will not process it when * we retry since a previous iclog is in the * CALLBACK and the state cannot change since - * we are holding the LOG_LOCK. + * we are holding the l_icloglock. */ if (!(iclog->ic_state & (XLOG_STATE_DONE_SYNC | @@ -2141,11 +2641,9 @@ xlog_state_do_callback( */ lowest_lsn = xlog_get_lowest_lsn(log); - if (lowest_lsn && ( - XFS_LSN_CMP( - lowest_lsn, - INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) - )<0)) { + if (lowest_lsn && + XFS_LSN_CMP(lowest_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)) < 0) { iclog = iclog->ic_next; continue; /* Leave this iclog for * another thread */ @@ -2153,51 +2651,64 @@ xlog_state_do_callback( iclog->ic_state = XLOG_STATE_CALLBACK; - LOG_UNLOCK(log, s); - - /* l_last_sync_lsn field protected by - * GRANT_LOCK. Don't worry about iclog's lsn. - * No one else can be here except us. - */ - s = GRANT_LOCK(log); - ASSERT(XFS_LSN_CMP( - log->l_last_sync_lsn, - INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) - )<=0); - log->l_last_sync_lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); - GRANT_UNLOCK(log, s); /* - * Keep processing entries in the callback list - * until we come around and it is empty. We - * need to atomically see that the list is - * empty and change the state to DIRTY so that - * we don't miss any more callbacks being added. + * Completion of a iclog IO does not imply that + * a transaction has completed, as transactions + * can be large enough to span many iclogs. We + * cannot change the tail of the log half way + * through a transaction as this may be the only + * transaction in the log and moving th etail to + * point to the middle of it will prevent + * recovery from finding the start of the + * transaction. Hence we should only update the + * last_sync_lsn if this iclog contains + * transaction completion callbacks on it. + * + * We have to do this before we drop the + * icloglock to ensure we are the only one that + * can update it. */ - s = LOG_LOCK(log); - } else { + ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn), + be64_to_cpu(iclog->ic_header.h_lsn)) <= 0); + if (iclog->ic_callback) + atomic64_set(&log->l_last_sync_lsn, + be64_to_cpu(iclog->ic_header.h_lsn)); + + } else ioerrors++; - } - cb = iclog->ic_callback; - while (cb != 0) { + spin_unlock(&log->l_icloglock); + + /* + * Keep processing entries in the callback list until + * we come around and it is empty. We need to + * atomically see that the list is empty and change the + * state to DIRTY so that we don't miss any more + * callbacks being added. + */ + spin_lock(&iclog->ic_callback_lock); + cb = iclog->ic_callback; + while (cb) { iclog->ic_callback_tail = &(iclog->ic_callback); iclog->ic_callback = NULL; - LOG_UNLOCK(log, s); + spin_unlock(&iclog->ic_callback_lock); /* perform callbacks in the order given */ - for (; cb != 0; cb = cb_next) { + for (; cb; cb = cb_next) { cb_next = cb->cb_next; cb->cb_func(cb->cb_arg, aborted); } - s = LOG_LOCK(log); + spin_lock(&iclog->ic_callback_lock); cb = iclog->ic_callback; } loopdidcallbacks++; funcdidcallbacks++; - ASSERT(iclog->ic_callback == 0); + spin_lock(&log->l_icloglock); + ASSERT(iclog->ic_callback == NULL); + spin_unlock(&iclog->ic_callback_lock); if (!(iclog->ic_state & XLOG_STATE_IOERROR)) iclog->ic_state = XLOG_STATE_DIRTY; @@ -2208,13 +2719,17 @@ xlog_state_do_callback( xlog_state_clean_log(log); /* wake up threads waiting in xfs_log_force() */ - sv_broadcast(&iclog->ic_forcesema); + wake_up_all(&iclog->ic_force_wait); iclog = iclog->ic_next; } while (first_iclog != iclog); - if (repeats && (repeats % 10) == 0) { - xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_state_do_callback: looping %d", repeats); + + if (repeats > 5000) { + flushcnt += repeats; + repeats = 0; + xfs_warn(log->l_mp, + "%s: possible infinite loop (%d iterations)", + __func__, flushcnt); } } while (!ioerrors && loopdidcallbacks); @@ -2233,7 +2748,7 @@ xlog_state_do_callback( * * SYNCING - i/o completion will go through logs * DONE_SYNC - interrupt thread should be waiting for - * LOG_LOCK + * l_icloglock * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || @@ -2246,14 +2761,13 @@ xlog_state_do_callback( } #endif - if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) { - flushcnt = log->l_flushcnt; - log->l_flushcnt = 0; - } - LOG_UNLOCK(log, s); - while (flushcnt--) - vsema(&log->l_flushsema); -} /* xlog_state_do_callback */ + if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) + wake = 1; + spin_unlock(&log->l_icloglock); + + if (wake) + wake_up_all(&log->l_flush_wait); +} /* @@ -2267,22 +2781,20 @@ xlog_state_do_callback( * the second completion goes through. * * Callbacks could take time, so they are done outside the scope of the - * global state machine log lock. Assume that the calls to cvsema won't - * take a long time. At least we know it won't sleep. + * global state machine log lock. */ -void +STATIC void xlog_state_done_syncing( xlog_in_core_t *iclog, int aborted) { - xlog_t *log = iclog->ic_log; - SPLDECL(s); + struct xlog *log = iclog->ic_log; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_IOERROR); - ASSERT(iclog->ic_refcnt == 0); + ASSERT(atomic_read(&iclog->ic_refcnt) == 0); ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); @@ -2294,7 +2806,7 @@ xlog_state_done_syncing( */ if (iclog->ic_state != XLOG_STATE_IOERROR) { if (--iclog->ic_bwritecnt == 1) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return; } iclog->ic_state = XLOG_STATE_DONE_SYNC; @@ -2305,19 +2817,17 @@ xlog_state_done_syncing( * iclog buffer, we wake them all, one will get to do the * I/O, the others get to wait for the result. */ - sv_broadcast(&iclog->ic_writesema); - LOG_UNLOCK(log, s); + wake_up_all(&iclog->ic_write_wait); + spin_unlock(&log->l_icloglock); xlog_state_do_callback(log, aborted, iclog); /* also cleans log */ } /* xlog_state_done_syncing */ /* * If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must - * sleep. The flush semaphore is set to the number of in-core buffers and - * decremented around disk syncing. Therefore, if all buffers are syncing, - * this semaphore will cause new writes to sleep until a sync completes. - * Otherwise, this code just does p() followed by v(). This approximates - * a sleep/wakeup except we can't race. + * sleep. We wait on the flush queue on the head iclog as that should be + * the first iclog to complete flushing. Hence if all iclogs are syncing, + * we will wait here and all new writes will sleep until a sync completes. * * The in-core logs are used in a circular fashion. They are not used * out-of-order even when an iclog past the head is free. @@ -2331,41 +2841,39 @@ xlog_state_done_syncing( * needs to be incremented, depending on the amount of data which * is copied. */ -int -xlog_state_get_iclog_space(xlog_t *log, - int len, - xlog_in_core_t **iclogp, - xlog_ticket_t *ticket, - int *continued_write, - int *logoffsetp) -{ - SPLDECL(s); +STATIC int +xlog_state_get_iclog_space( + struct xlog *log, + int len, + struct xlog_in_core **iclogp, + struct xlog_ticket *ticket, + int *continued_write, + int *logoffsetp) +{ int log_offset; xlog_rec_header_t *head; xlog_in_core_t *iclog; int error; restart: - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); if (XLOG_FORCED_SHUTDOWN(log)) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } iclog = log->l_iclog; - if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) { - log->l_flushcnt++; - LOG_UNLOCK(log, s); - xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH); + if (iclog->ic_state != XLOG_STATE_ACTIVE) { XFS_STATS_INC(xs_log_noiclogs); - /* Ensure that log writes happen */ - psema(&log->l_flushsema, PINOD); + + /* Wait for log writes to have flushed */ + xlog_wait(&log->l_flush_wait, &log->l_icloglock); goto restart; } - ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); + head = &iclog->ic_header; - iclog->ic_refcnt++; /* prevents sync */ + atomic_inc(&iclog->ic_refcnt); /* prevents sync */ log_offset = iclog->ic_offset; /* On the 1st write to an iclog, figure out lsn. This works @@ -2375,11 +2883,12 @@ restart: */ if (log_offset == 0) { ticket->t_curr_res -= log->l_iclog_hsize; - XLOG_TIC_ADD_REGION(ticket, + xlog_tic_add_region(ticket, log->l_iclog_hsize, XLOG_REG_TYPE_LRHEADER); - INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle); - ASSIGN_LSN(head->h_lsn, log); + head->h_cycle = cpu_to_be32(log->l_curr_cycle); + head->h_lsn = cpu_to_be64( + xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block)); ASSERT(log->l_curr_block >= 0); } @@ -2395,14 +2904,21 @@ restart: if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) { xlog_state_switch_iclogs(log, iclog, iclog->ic_size); - /* If I'm the only one writing to this iclog, sync it to disk */ - if (iclog->ic_refcnt == 1) { - LOG_UNLOCK(log, s); - if ((error = xlog_state_release_iclog(log, iclog))) + /* + * If I'm the only one writing to this iclog, sync it to disk. + * We need to do an atomic compare and decrement here to avoid + * racing with concurrent atomic_dec_and_lock() calls in + * xlog_state_release_iclog() when there is more than one + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) { + /* we are the only one */ + spin_unlock(&log->l_icloglock); + error = xlog_state_release_iclog(log, iclog); + if (error) return error; } else { - iclog->ic_refcnt--; - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } goto restart; } @@ -2423,266 +2939,12 @@ restart: *iclogp = iclog; ASSERT(iclog->ic_offset <= iclog->ic_size); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); *logoffsetp = log_offset; return 0; } /* xlog_state_get_iclog_space */ -/* - * Atomically get the log space required for a log ticket. - * - * Once a ticket gets put onto the reserveq, it will only return after - * the needed reservation is satisfied. - */ -STATIC int -xlog_grant_log_space(xlog_t *log, - xlog_ticket_t *tic) -{ - int free_bytes; - int need_bytes; - SPLDECL(s); -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif - - -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("grant Recovery problem"); -#endif - - /* Is there space or do we need to sleep? */ - s = GRANT_LOCK(log); - xlog_trace_loggrant(log, tic, "xlog_grant_log_space: enter"); - - /* something is already sleeping; insert new transaction at end */ - if (log->l_reserve_headq) { - xlog_ins_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: sleep 1"); - /* - * Gotta check this before going to sleep, while we're - * holding the grant lock. - */ - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); - /* - * If we got an error, and the filesystem is shutting down, - * we'll catch it down below. So just continue... - */ - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: wake 1"); - s = GRANT_LOCK(log); - } - if (tic->t_flags & XFS_LOG_PERM_RESERV) - need_bytes = tic->t_unit_res*tic->t_ocnt; - else - need_bytes = tic->t_unit_res; - -redo: - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - free_bytes = xlog_space_left(log, log->l_grant_reserve_cycle, - log->l_grant_reserve_bytes); - if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: sleep 2"); - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); - - if (XLOG_FORCED_SHUTDOWN(log)) { - s = GRANT_LOCK(log); - goto error_return; - } - - xlog_trace_loggrant(log, tic, - "xlog_grant_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - s = GRANT_LOCK(log); - goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - - /* we've got enough space */ - xlog_grant_add_space(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - /* - * Check to make sure the grant write head didn't just over lap the - * tail. If the cycles are the same, we can't be overlapping. - * Otherwise, make sure that the cycles differ by exactly one and - * check the byte count. - */ - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); - } -#endif - xlog_trace_loggrant(log, tic, "xlog_grant_log_space: exit"); - xlog_verify_grant_head(log, 1); - GRANT_UNLOCK(log, s); - return 0; - - error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, "xlog_grant_log_space: err_ret"); - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back when - * the ticket/transaction gets cancelled. - */ - tic->t_curr_res = 0; - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - GRANT_UNLOCK(log, s); - return XFS_ERROR(EIO); -} /* xlog_grant_log_space */ - - -/* - * Replenish the byte reservation required by moving the grant write head. - * - * - */ -STATIC int -xlog_regrant_write_log_space(xlog_t *log, - xlog_ticket_t *tic) -{ - SPLDECL(s); - int free_bytes, need_bytes; - xlog_ticket_t *ntic; -#ifdef DEBUG - xfs_lsn_t tail_lsn; -#endif - - tic->t_curr_res = tic->t_unit_res; - XLOG_TIC_RESET_RES(tic); - - if (tic->t_cnt > 0) - return 0; - -#ifdef DEBUG - if (log->l_flags & XLOG_ACTIVE_RECOVERY) - panic("regrant Recovery problem"); -#endif - - s = GRANT_LOCK(log); - xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: enter"); - - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - /* If there are other waiters on the queue then give them a - * chance at logspace before us. Wake up the first waiters, - * if we do not wake up all the waiters then go to sleep waiting - * for more free space, otherwise try to get some space for - * this transaction. - */ - - if ((ntic = log->l_write_headq)) { - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); - do { - ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV); - - if (free_bytes < ntic->t_unit_res) - break; - free_bytes -= ntic->t_unit_res; - sv_signal(&ntic->t_sema); - ntic = ntic->t_next; - } while (ntic != log->l_write_headq); - - if (ntic != log->l_write_headq) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); - - xlog_trace_loggrant(log, tic, - "xlog_regrant_write_log_space: sleep 1"); - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, - &log->l_grant_lock, s); - - /* If we're shutting down, this tic is already - * off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - s = GRANT_LOCK(log); - goto error_return; - } - - xlog_trace_loggrant(log, tic, - "xlog_regrant_write_log_space: wake 1"); - xlog_grant_push_ail(log->l_mp, tic->t_unit_res); - s = GRANT_LOCK(log); - } - } - - need_bytes = tic->t_unit_res; - -redo: - if (XLOG_FORCED_SHUTDOWN(log)) - goto error_return; - - free_bytes = xlog_space_left(log, log->l_grant_write_cycle, - log->l_grant_write_bytes); - if (free_bytes < need_bytes) { - if ((tic->t_flags & XLOG_TIC_IN_Q) == 0) - xlog_ins_ticketq(&log->l_write_headq, tic); - XFS_STATS_INC(xs_sleep_logspace); - sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s); - - /* If we're shutting down, this tic is already off the queue */ - if (XLOG_FORCED_SHUTDOWN(log)) { - s = GRANT_LOCK(log); - goto error_return; - } - - xlog_trace_loggrant(log, tic, - "xlog_regrant_write_log_space: wake 2"); - xlog_grant_push_ail(log->l_mp, need_bytes); - s = GRANT_LOCK(log); - goto redo; - } else if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_write_headq, tic); - - /* we've got enough space */ - xlog_grant_add_space_write(log, need_bytes); -#ifdef DEBUG - tail_lsn = log->l_tail_lsn; - if (CYCLE_LSN(tail_lsn) != log->l_grant_write_cycle) { - ASSERT(log->l_grant_write_cycle-1 == CYCLE_LSN(tail_lsn)); - ASSERT(log->l_grant_write_bytes <= BBTOB(BLOCK_LSN(tail_lsn))); - } -#endif - - xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: exit"); - xlog_verify_grant_head(log, 1); - GRANT_UNLOCK(log, s); - return 0; - - - error_return: - if (tic->t_flags & XLOG_TIC_IN_Q) - xlog_del_ticketq(&log->l_reserve_headq, tic); - xlog_trace_loggrant(log, tic, "xlog_regrant_write_log_space: err_ret"); - /* - * If we are failing, make sure the ticket doesn't have any - * current reservations. We don't want to add this back when - * the ticket/transaction gets cancelled. - */ - tic->t_curr_res = 0; - tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */ - GRANT_UNLOCK(log, s); - return XFS_ERROR(EIO); -} /* xlog_regrant_write_log_space */ - - /* The first cnt-1 times through here we don't need to * move the grant write head because the permanent * reservation has reserved cnt times the unit amount. @@ -2691,37 +2953,35 @@ redo: * move grant reservation head forward. */ STATIC void -xlog_regrant_reserve_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_regrant_reserve_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { - SPLDECL(s); + trace_xfs_log_regrant_reserve_enter(log, ticket); - xlog_trace_loggrant(log, ticket, - "xlog_regrant_reserve_log_space: enter"); if (ticket->t_cnt > 0) ticket->t_cnt--; - s = GRANT_LOCK(log); - xlog_grant_sub_space(log, ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_reserve_head.grant, + ticket->t_curr_res); + xlog_grant_sub_space(log, &log->l_write_head.grant, + ticket->t_curr_res); ticket->t_curr_res = ticket->t_unit_res; - XLOG_TIC_RESET_RES(ticket); - xlog_trace_loggrant(log, ticket, - "xlog_regrant_reserve_log_space: sub current res"); - xlog_verify_grant_head(log, 1); + xlog_tic_reset_res(ticket); + + trace_xfs_log_regrant_reserve_sub(log, ticket); /* just return if we still have some of the pre-reserved space */ - if (ticket->t_cnt > 0) { - GRANT_UNLOCK(log, s); + if (ticket->t_cnt > 0) return; - } - xlog_grant_add_space_reserve(log, ticket->t_unit_res); - xlog_trace_loggrant(log, ticket, - "xlog_regrant_reserve_log_space: exit"); - xlog_verify_grant_head(log, 0); - GRANT_UNLOCK(log, s); + xlog_grant_add_space(log, &log->l_reserve_head.grant, + ticket->t_unit_res); + + trace_xfs_log_regrant_reserve_exit(log, ticket); + ticket->t_curr_res = ticket->t_unit_res; - XLOG_TIC_RESET_RES(ticket); + xlog_tic_reset_res(ticket); } /* xlog_regrant_reserve_log_space */ @@ -2740,49 +3000,35 @@ xlog_regrant_reserve_log_space(xlog_t *log, * in the current reservation field. */ STATIC void -xlog_ungrant_log_space(xlog_t *log, - xlog_ticket_t *ticket) +xlog_ungrant_log_space( + struct xlog *log, + struct xlog_ticket *ticket) { - SPLDECL(s); + int bytes; if (ticket->t_cnt > 0) ticket->t_cnt--; - s = GRANT_LOCK(log); - xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: enter"); + trace_xfs_log_ungrant_enter(log, ticket); + trace_xfs_log_ungrant_sub(log, ticket); - xlog_grant_sub_space(log, ticket->t_curr_res); - - xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: sub current"); - - /* If this is a permanent reservation ticket, we may be able to free + /* + * If this is a permanent reservation ticket, we may be able to free * up more space based on the remaining count. */ + bytes = ticket->t_curr_res; if (ticket->t_cnt > 0) { ASSERT(ticket->t_flags & XLOG_TIC_PERM_RESERV); - xlog_grant_sub_space(log, ticket->t_unit_res*ticket->t_cnt); + bytes += ticket->t_unit_res*ticket->t_cnt; } - xlog_trace_loggrant(log, ticket, "xlog_ungrant_log_space: exit"); - xlog_verify_grant_head(log, 1); - GRANT_UNLOCK(log, s); - xfs_log_move_tail(log->l_mp, 1); -} /* xlog_ungrant_log_space */ + xlog_grant_sub_space(log, &log->l_reserve_head.grant, bytes); + xlog_grant_sub_space(log, &log->l_write_head.grant, bytes); + trace_xfs_log_ungrant_exit(log, ticket); -/* - * Atomically put back used ticket. - */ -void -xlog_state_put_ticket(xlog_t *log, - xlog_ticket_t *tic) -{ - unsigned long s; - - s = LOG_LOCK(log); - xlog_ticket_put(log, tic); - LOG_UNLOCK(log, s); -} /* xlog_state_put_ticket */ + xfs_log_space_wake(log->l_mp); +} /* * Flush iclog to disk if this is the last reference to the given iclog and @@ -2793,36 +3039,37 @@ xlog_state_put_ticket(xlog_t *log, * * */ -int -xlog_state_release_iclog(xlog_t *log, - xlog_in_core_t *iclog) +STATIC int +xlog_state_release_iclog( + struct xlog *log, + struct xlog_in_core *iclog) { - SPLDECL(s); int sync = 0; /* do we sync? */ - xlog_assign_tail_lsn(log->l_mp); + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); - s = LOG_LOCK(log); + ASSERT(atomic_read(&iclog->ic_refcnt) > 0); + if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) + return 0; if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } - - ASSERT(iclog->ic_refcnt > 0); ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_WANT_SYNC); - if (--iclog->ic_refcnt == 0 && - iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if (iclog->ic_state == XLOG_STATE_WANT_SYNC) { + /* update tail before writing to iclog */ + xfs_lsn_t tail_lsn = xlog_assign_tail_lsn(log->l_mp); sync++; iclog->ic_state = XLOG_STATE_SYNCING; - INT_SET(iclog->ic_header.h_tail_lsn, ARCH_CONVERT, log->l_tail_lsn); - xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog, tail_lsn); /* cycle incremented when incrementing curr_block */ } - - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); /* * We let the log lock go, so it's possible that we hit a log I/O @@ -2831,11 +3078,9 @@ xlog_state_release_iclog(xlog_t *log, * this iclog has consistent data, so we ignore IOERROR * flags after this point. */ - if (sync) { + if (sync) return xlog_sync(log, iclog); - } return 0; - } /* xlog_state_release_iclog */ @@ -2847,15 +3092,16 @@ xlog_state_release_iclog(xlog_t *log, * that every data block. We have run out of space in this log record. */ STATIC void -xlog_state_switch_iclogs(xlog_t *log, - xlog_in_core_t *iclog, - int eventual_size) +xlog_state_switch_iclogs( + struct xlog *log, + struct xlog_in_core *iclog, + int eventual_size) { ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); if (!eventual_size) eventual_size = iclog->ic_offset; iclog->ic_state = XLOG_STATE_WANT_SYNC; - INT_SET(iclog->ic_header.h_prev_block, ARCH_CONVERT, log->l_prev_block); + iclog->ic_header.h_prev_block = cpu_to_be32(log->l_prev_block); log->l_prev_block = log->l_curr_block; log->l_prev_cycle = log->l_curr_cycle; @@ -2863,7 +3109,7 @@ xlog_state_switch_iclogs(xlog_t *log, log->l_curr_block += BTOBB(eventual_size)+BTOBB(log->l_iclog_hsize); /* Round up to next log-sunit */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb) && log->l_mp->m_sb.sb_logsunit > 1) { __uint32_t sunit_bb = BTOBB(log->l_mp->m_sb.sb_logsunit); log->l_curr_block = roundup(log->l_curr_block, sunit_bb); @@ -2880,7 +3126,6 @@ xlog_state_switch_iclogs(xlog_t *log, log->l_iclog = iclog->ic_next; } /* xlog_state_switch_iclogs */ - /* * Write out all data in the in-core log as of this exact moment in time. * @@ -2897,7 +3142,7 @@ xlog_state_switch_iclogs(xlog_t *log, * 2. the current iclog is drity, and the previous iclog is in the * active or dirty state. * - * We may sleep (call psema) if: + * We may sleep if: * * 1. the current iclog is not in the active nor dirty state. * 2. the current iclog dirty, and the previous iclog is not in the @@ -2908,18 +3153,25 @@ xlog_state_switch_iclogs(xlog_t *log, * b) when we return from flushing out this iclog, it is still * not in the active nor dirty state. */ -STATIC int -xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) +int +_xfs_log_force( + struct xfs_mount *mp, + uint flags, + int *log_flushed) { - xlog_in_core_t *iclog; - xfs_lsn_t lsn; - SPLDECL(s); + struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + xfs_lsn_t lsn; + + XFS_STATS_INC(xs_log_force); + + xlog_cil_force(log); - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } @@ -2936,7 +3188,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) * previous iclog and go to sleep. */ if (iclog->ic_state == XLOG_STATE_DIRTY || - (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { + (atomic_read(&iclog->ic_refcnt) == 0 + && iclog->ic_offset == 0)) { iclog = iclog->ic_prev; if (iclog->ic_state == XLOG_STATE_ACTIVE || iclog->ic_state == XLOG_STATE_DIRTY) @@ -2944,23 +3197,25 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed) else goto maybe_sleep; } else { - if (iclog->ic_refcnt == 0) { + if (atomic_read(&iclog->ic_refcnt) == 0) { /* We are the only one with access to this * iclog. Flush it out now. There should * be a roundoff of zero to show that someone * has already taken care of the roundoff from * the previous sync. */ - iclog->ic_refcnt++; - lsn = INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT); + atomic_inc(&iclog->ic_refcnt); + lsn = be64_to_cpu(iclog->ic_header.h_lsn); xlog_state_switch_iclogs(log, iclog, 0); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); - *log_flushed = 1; - s = LOG_LOCK(log); - if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) == lsn && + + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); + if (be64_to_cpu(iclog->ic_header.h_lsn) == lsn && iclog->ic_state != XLOG_STATE_DIRTY) goto maybe_sleep; else @@ -2985,16 +3240,16 @@ maybe_sleep: if (flags & XFS_LOG_SYNC) { /* * We must check if we're shutting down here, before - * we wait, while we're holding the LOG_LOCK. + * we wait, while we're holding the l_icloglock. * Then we check again after waking up, in case our * sleep was disturbed by a bad news. */ if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); return XFS_ERROR(EIO); } XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're * only deciding whether or not to return EIO @@ -3002,19 +3257,36 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return XFS_ERROR(EIO); - *log_flushed = 1; - + if (log_flushed) + *log_flushed = 1; } else { no_sleep: - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } return 0; -} /* xlog_state_sync_all */ +} +/* + * Wrapper for _xfs_log_force(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force( + xfs_mount_t *mp, + uint flags) +{ + int error; + + trace_xfs_log_force(mp, 0); + error = _xfs_log_force(mp, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} /* - * Used by code which implements synchronous log forces. + * Force the in-core log to disk for a specific LSN. * * Find in-core log with lsn. * If it is in the DIRTY state, just return. @@ -3022,122 +3294,156 @@ no_sleep: * state and go to sleep or return. * If it is in any other state, go to sleep or return. * - * If filesystem activity goes to zero, the iclog will get flushed only by - * bdflush(). + * Synchronous forces are implemented with a signal variable. All callers + * to force a given lsn to disk will wait on a the sv attached to the + * specific in-core log. When given in-core log finally completes its + * write to disk, that thread will wake up all threads waiting on the + * sv. */ int -xlog_state_sync(xlog_t *log, - xfs_lsn_t lsn, - uint flags, - int *log_flushed) +_xfs_log_force_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_flushed) { - xlog_in_core_t *iclog; - int already_slept = 0; - SPLDECL(s); + struct xlog *log = mp->m_log; + struct xlog_in_core *iclog; + int already_slept = 0; + ASSERT(lsn != 0); -try_again: - s = LOG_LOCK(log); - iclog = log->l_iclog; + XFS_STATS_INC(xs_log_force); - if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); - return XFS_ERROR(EIO); - } + lsn = xlog_cil_force_lsn(log, lsn); + if (lsn == NULLCOMMITLSN) + return 0; - do { - if (INT_GET(iclog->ic_header.h_lsn, ARCH_CONVERT) != lsn) { - iclog = iclog->ic_next; - continue; +try_again: + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); } - if (iclog->ic_state == XLOG_STATE_DIRTY) { - LOG_UNLOCK(log, s); - return 0; - } + do { + if (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { + iclog = iclog->ic_next; + continue; + } - if (iclog->ic_state == XLOG_STATE_ACTIVE) { - /* - * We sleep here if we haven't already slept (e.g. - * this is the first time we've looked at the correct - * iclog buf) and the buffer before us is going to - * be sync'ed. The reason for this is that if we - * are doing sync transactions here, by waiting for - * the previous I/O to complete, we can allow a few - * more transactions into this iclog before we close - * it down. - * - * Otherwise, we mark the buffer WANT_SYNC, and bump - * up the refcnt so we can release the log (which drops - * the ref count). The state switch keeps new transaction - * commits from using this buffer. When the current commits - * finish writing into the buffer, the refcount will drop to - * zero and the buffer will go out then. - */ - if (!already_slept && - (iclog->ic_prev->ic_state & (XLOG_STATE_WANT_SYNC | - XLOG_STATE_SYNCING))) { - ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_prev->ic_writesema, PSWP, - &log->l_icloglock, s); - *log_flushed = 1; - already_slept = 1; - goto try_again; - } else { - iclog->ic_refcnt++; + if (iclog->ic_state == XLOG_STATE_DIRTY) { + spin_unlock(&log->l_icloglock); + return 0; + } + + if (iclog->ic_state == XLOG_STATE_ACTIVE) { + /* + * We sleep here if we haven't already slept (e.g. + * this is the first time we've looked at the correct + * iclog buf) and the buffer before us is going to + * be sync'ed. The reason for this is that if we + * are doing sync transactions here, by waiting for + * the previous I/O to complete, we can allow a few + * more transactions into this iclog before we close + * it down. + * + * Otherwise, we mark the buffer WANT_SYNC, and bump + * up the refcnt so we can release the log (which + * drops the ref count). The state switch keeps new + * transaction commits from using this buffer. When + * the current commits finish writing into the buffer, + * the refcount will drop to zero and the buffer will + * go out then. + */ + if (!already_slept && + (iclog->ic_prev->ic_state & + (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { + ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); + + XFS_STATS_INC(xs_log_force_sleep); + + xlog_wait(&iclog->ic_prev->ic_write_wait, + &log->l_icloglock); + if (log_flushed) + *log_flushed = 1; + already_slept = 1; + goto try_again; + } + atomic_inc(&iclog->ic_refcnt); xlog_state_switch_iclogs(log, iclog, 0); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); if (xlog_state_release_iclog(log, iclog)) return XFS_ERROR(EIO); - *log_flushed = 1; - s = LOG_LOCK(log); + if (log_flushed) + *log_flushed = 1; + spin_lock(&log->l_icloglock); } - } - if ((flags & XFS_LOG_SYNC) && /* sleep */ - !(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + if ((flags & XFS_LOG_SYNC) && /* sleep */ + !(iclog->ic_state & + (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) { + /* + * Don't wait on completion if we know that we've + * gotten a log write error. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) { + spin_unlock(&log->l_icloglock); + return XFS_ERROR(EIO); + } + XFS_STATS_INC(xs_log_force_sleep); + xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); + /* + * No need to grab the log lock here since we're + * only deciding whether or not to return EIO + * and the memory read should be atomic. + */ + if (iclog->ic_state & XLOG_STATE_IOERROR) + return XFS_ERROR(EIO); - /* - * Don't wait on the forcesema if we know that we've - * gotten a log write error. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) { - LOG_UNLOCK(log, s); - return XFS_ERROR(EIO); + if (log_flushed) + *log_flushed = 1; + } else { /* just return */ + spin_unlock(&log->l_icloglock); } - XFS_STATS_INC(xs_log_force_sleep); - sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s); - /* - * No need to grab the log lock here since we're - * only deciding whether or not to return EIO - * and the memory read should be atomic. - */ - if (iclog->ic_state & XLOG_STATE_IOERROR) - return XFS_ERROR(EIO); - *log_flushed = 1; - } else { /* just return */ - LOG_UNLOCK(log, s); - } - return 0; - } while (iclog != log->l_iclog); + return 0; + } while (iclog != log->l_iclog); - LOG_UNLOCK(log, s); - return 0; -} /* xlog_state_sync */ + spin_unlock(&log->l_icloglock); + return 0; +} +/* + * Wrapper for _xfs_log_force_lsn(), to be used when caller doesn't care + * about errors or whether the log was flushed or not. This is the normal + * interface to use when trying to unpin items or move the log forward. + */ +void +xfs_log_force_lsn( + xfs_mount_t *mp, + xfs_lsn_t lsn, + uint flags) +{ + int error; + + trace_xfs_log_force(mp, lsn); + error = _xfs_log_force_lsn(mp, lsn, flags, NULL); + if (error) + xfs_warn(mp, "%s: error %d returned.", __func__, error); +} /* * Called when we want to mark the current iclog as being ready to sync to * disk. */ -void -xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) +STATIC void +xlog_state_want_sync( + struct xlog *log, + struct xlog_in_core *iclog) { - SPLDECL(s); - - s = LOG_LOCK(log); + assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); @@ -3145,10 +3451,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) ASSERT(iclog->ic_state & (XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR)); } - - LOG_UNLOCK(log, s); -} /* xlog_state_want_sync */ - +} /***************************************************************************** @@ -3159,120 +3462,38 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) */ /* - * Algorithm doesn't take into account page size. ;-( + * Free a used ticket when its refcount falls to zero. */ -STATIC void -xlog_state_ticket_alloc(xlog_t *log) +void +xfs_log_ticket_put( + xlog_ticket_t *ticket) { - xlog_ticket_t *t_list; - xlog_ticket_t *next; - xfs_caddr_t buf; - uint i = (NBPP / sizeof(xlog_ticket_t)) - 2; - SPLDECL(s); - - /* - * The kmem_zalloc may sleep, so we shouldn't be holding the - * global lock. XXXmiken: may want to use zone allocator. - */ - buf = (xfs_caddr_t) kmem_zalloc(NBPP, KM_SLEEP); - - s = LOG_LOCK(log); - - /* Attach 1st ticket to Q, so we can keep track of allocated memory */ - t_list = (xlog_ticket_t *)buf; - t_list->t_next = log->l_unmount_free; - log->l_unmount_free = t_list++; - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - - /* Next ticket becomes first ticket attached to ticket free list */ - if (log->l_freelist != NULL) { - ASSERT(log->l_tail != NULL); - log->l_tail->t_next = t_list; - } else { - log->l_freelist = t_list; - } - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - - /* Cycle through rest of alloc'ed memory, building up free Q */ - for ( ; i > 0; i--) { - next = t_list + 1; - t_list->t_next = next; - t_list = next; - log->l_ticket_cnt++; - log->l_ticket_tcnt++; - } - t_list->t_next = NULL; - log->l_tail = t_list; - LOG_UNLOCK(log, s); -} /* xlog_state_ticket_alloc */ - + ASSERT(atomic_read(&ticket->t_ref) > 0); + if (atomic_dec_and_test(&ticket->t_ref)) + kmem_zone_free(xfs_log_ticket_zone, ticket); +} -/* - * Put ticket into free list - * - * Assumption: log lock is held around this call. - */ -STATIC void -xlog_ticket_put(xlog_t *log, - xlog_ticket_t *ticket) +xlog_ticket_t * +xfs_log_ticket_get( + xlog_ticket_t *ticket) { - sv_destroy(&ticket->t_sema); - - /* - * Don't think caching will make that much difference. It's - * more important to make debug easier. - */ -#if 0 - /* real code will want to use LIFO for caching */ - ticket->t_next = log->l_freelist; - log->l_freelist = ticket; - /* no need to clear fields */ -#else - /* When we debug, it is easier if tickets are cycled */ - ticket->t_next = NULL; - if (log->l_tail != 0) { - log->l_tail->t_next = ticket; - } else { - ASSERT(log->l_freelist == 0); - log->l_freelist = ticket; - } - log->l_tail = ticket; -#endif /* DEBUG */ - log->l_ticket_cnt++; -} /* xlog_ticket_put */ - + ASSERT(atomic_read(&ticket->t_ref) > 0); + atomic_inc(&ticket->t_ref); + return ticket; +} /* - * Grab ticket off freelist or allocation some more + * Figure out the total log space unit (in bytes) that would be + * required for a log ticket. */ -xlog_ticket_t * -xlog_ticket_get(xlog_t *log, - int unit_bytes, - int cnt, - char client, - uint xflags) -{ - xlog_ticket_t *tic; - uint num_headers; - SPLDECL(s); - - alloc: - if (log->l_freelist == NULL) - xlog_state_ticket_alloc(log); /* potentially sleep */ - - s = LOG_LOCK(log); - if (log->l_freelist == NULL) { - LOG_UNLOCK(log, s); - goto alloc; - } - tic = log->l_freelist; - log->l_freelist = tic->t_next; - if (log->l_freelist == NULL) - log->l_tail = NULL; - log->l_ticket_cnt--; - LOG_UNLOCK(log, s); +int +xfs_log_calc_unit_res( + struct xfs_mount *mp, + int unit_bytes) +{ + struct xlog *log = mp->m_log; + int iclog_space; + uint num_headers; /* * Permanent reservations have up to 'cnt'-1 active log operations @@ -3312,42 +3533,91 @@ xlog_ticket_get(xlog_t *log, /* for start-rec */ unit_bytes += sizeof(xlog_op_header_t); - /* for LR headers */ - num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log); + /* + * for LR headers - the space for data in an iclog is the size minus + * the space used for the headers. If we use the iclog size, then we + * undercalculate the number of headers required. + * + * Furthermore - the addition of op headers for split-recs might + * increase the space required enough to require more log and op + * headers, so take that into account too. + * + * IMPORTANT: This reservation makes the assumption that if this + * transaction is the first in an iclog and hence has the LR headers + * accounted to it, then the remaining space in the iclog is + * exclusively for this transaction. i.e. if the transaction is larger + * than the iclog, it will be the only thing in that iclog. + * Fundamentally, this means we must pass the entire log vector to + * xlog_write to guarantee this. + */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + num_headers = howmany(unit_bytes, iclog_space); + + /* for split-recs - ophdrs added when data split over LRs */ + unit_bytes += sizeof(xlog_op_header_t) * num_headers; + + /* add extra header reservations if we overrun */ + while (!num_headers || + howmany(unit_bytes, iclog_space) > num_headers) { + unit_bytes += sizeof(xlog_op_header_t); + num_headers++; + } unit_bytes += log->l_iclog_hsize * num_headers; /* for commit-rec LR header - note: padding will subsume the ophdr */ unit_bytes += log->l_iclog_hsize; - /* for split-recs - ophdrs added when data split over LRs */ - unit_bytes += sizeof(xlog_op_header_t) * num_headers; - /* for roundoff padding for transaction data and one for commit record */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) && - log->l_mp->m_sb.sb_logsunit > 1) { + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) { /* log su roundoff */ - unit_bytes += 2*log->l_mp->m_sb.sb_logsunit; + unit_bytes += 2 * mp->m_sb.sb_logsunit; } else { /* BB roundoff */ - unit_bytes += 2*BBSIZE; + unit_bytes += 2 * BBSIZE; } - tic->t_unit_res = unit_bytes; - tic->t_curr_res = unit_bytes; + return unit_bytes; +} + +/* + * Allocate and initialise a new log ticket. + */ +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int cnt, + char client, + bool permanent, + xfs_km_flags_t alloc_flags) +{ + struct xlog_ticket *tic; + int unit_res; + + tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags); + if (!tic) + return NULL; + + unit_res = xfs_log_calc_unit_res(log->l_mp, unit_bytes); + + atomic_set(&tic->t_ref, 1); + tic->t_task = current; + INIT_LIST_HEAD(&tic->t_queue); + tic->t_unit_res = unit_res; + tic->t_curr_res = unit_res; tic->t_cnt = cnt; tic->t_ocnt = cnt; - tic->t_tid = (xlog_tid_t)((__psint_t)tic & 0xffffffff); + tic->t_tid = prandom_u32(); tic->t_clientid = client; tic->t_flags = XLOG_TIC_INITED; tic->t_trans_type = 0; - if (xflags & XFS_LOG_PERM_RESERV) + if (permanent) tic->t_flags |= XLOG_TIC_PERM_RESERV; - sv_init(&(tic->t_sema), SV_DEFAULT, "logtick"); - XLOG_TIC_RESET_RES(tic); + xlog_tic_reset_res(tic); return tic; -} /* xlog_ticket_get */ +} /****************************************************************************** @@ -3363,40 +3633,66 @@ xlog_ticket_get(xlog_t *log, * part of the log in case we trash the log structure. */ void -xlog_verify_dest_ptr(xlog_t *log, - __psint_t ptr) +xlog_verify_dest_ptr( + struct xlog *log, + char *ptr) { int i; int good_ptr = 0; - for (i=0; i < log->l_iclog_bufs; i++) { - if (ptr >= (__psint_t)log->l_iclog_bak[i] && - ptr <= (__psint_t)log->l_iclog_bak[i]+log->l_iclog_size) + for (i = 0; i < log->l_iclog_bufs; i++) { + if (ptr >= log->l_iclog_bak[i] && + ptr <= log->l_iclog_bak[i] + log->l_iclog_size) good_ptr++; } - if (! good_ptr) - xlog_panic("xlog_verify_dest_ptr: invalid ptr"); -} /* xlog_verify_dest_ptr */ + if (!good_ptr) + xfs_emerg(log->l_mp, "%s: invalid ptr", __func__); +} + +/* + * Check to make sure the grant write head didn't just over lap the tail. If + * the cycles are the same, we can't be overlapping. Otherwise, make sure that + * the cycles differ by exactly one and check the byte count. + * + * This check is run unlocked, so can give false positives. Rather than assert + * on failures, use a warn-once flag and a panic tag to allow the admin to + * determine if they want to panic the machine when such an error occurs. For + * debug kernels this will have the same effect as using an assert but, unlinke + * an assert, it can be turned off at runtime. + */ STATIC void -xlog_verify_grant_head(xlog_t *log, int equals) +xlog_verify_grant_tail( + struct xlog *log) { - if (log->l_grant_reserve_cycle == log->l_grant_write_cycle) { - if (equals) - ASSERT(log->l_grant_reserve_bytes >= log->l_grant_write_bytes); - else - ASSERT(log->l_grant_reserve_bytes > log->l_grant_write_bytes); - } else { - ASSERT(log->l_grant_reserve_cycle-1 == log->l_grant_write_cycle); - ASSERT(log->l_grant_write_bytes >= log->l_grant_reserve_bytes); - } -} /* xlog_verify_grant_head */ + int tail_cycle, tail_blocks; + int cycle, space; + + xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &space); + xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); + if (tail_cycle != cycle) { + if (cycle - 1 != tail_cycle && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: cycle - 1 != tail_cycle", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + + if (space > BBTOB(tail_blocks) && + !(log->l_flags & XLOG_TAIL_WARN)) { + xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, + "%s: space > BBTOB(tail_blocks)", __func__); + log->l_flags |= XLOG_TAIL_WARN; + } + } +} /* check if it will fit */ STATIC void -xlog_verify_tail_lsn(xlog_t *log, - xlog_in_core_t *iclog, - xfs_lsn_t tail_lsn) +xlog_verify_tail_lsn( + struct xlog *log, + struct xlog_in_core *iclog, + xfs_lsn_t tail_lsn) { int blocks; @@ -3404,16 +3700,16 @@ xlog_verify_tail_lsn(xlog_t *log, blocks = log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) - xlog_panic("xlog_verify_tail_lsn: ran out of log space"); + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } else { ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); if (BLOCK_LSN(tail_lsn) == log->l_prev_block) - xlog_panic("xlog_verify_tail_lsn: tail wrapped"); + xfs_emerg(log->l_mp, "%s: tail wrapped", __func__); blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; if (blocks < BTOBB(iclog->ic_offset) + 1) - xlog_panic("xlog_verify_tail_lsn: ran out of log space"); + xfs_emerg(log->l_mp, "%s: ran out of log space", __func__); } } /* xlog_verify_tail_lsn */ @@ -3433,10 +3729,11 @@ xlog_verify_tail_lsn(xlog_t *log, * the cycle numbers agree with the current cycle number. */ STATIC void -xlog_verify_iclog(xlog_t *log, - xlog_in_core_t *iclog, - int count, - boolean_t syncing) +xlog_verify_iclog( + struct xlog *log, + struct xlog_in_core *iclog, + int count, + bool syncing) { xlog_op_header_t *ophead; xlog_in_core_t *icptr; @@ -3447,74 +3744,75 @@ xlog_verify_iclog(xlog_t *log, __uint8_t clientid; int len, i, j, k, op_len; int idx; - SPLDECL(s); /* check validity of iclog pointers */ - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); icptr = log->l_iclog; - for (i=0; i < log->l_iclog_bufs; i++) { - if (icptr == 0) - xlog_panic("xlog_verify_iclog: invalid ptr"); - icptr = icptr->ic_next; - } + for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next) + ASSERT(icptr); + if (icptr != log->l_iclog) - xlog_panic("xlog_verify_iclog: corrupt iclog ring"); - LOG_UNLOCK(log, s); + xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__); + spin_unlock(&log->l_icloglock); /* check log magic numbers */ - ptr = (xfs_caddr_t) &(iclog->ic_header); - if (INT_GET(*(uint *)ptr, ARCH_CONVERT) != XLOG_HEADER_MAGIC_NUM) - xlog_panic("xlog_verify_iclog: invalid magic num"); + if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&(iclog->ic_header))+count; + ptr = (xfs_caddr_t) &iclog->ic_header; + for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; ptr += BBSIZE) { - if (INT_GET(*(uint *)ptr, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM) - xlog_panic("xlog_verify_iclog: unexpected magic num"); + if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) + xfs_emerg(log->l_mp, "%s: unexpected magic num", + __func__); } /* check fields */ - len = INT_GET(iclog->ic_header.h_num_logops, ARCH_CONVERT); + len = be32_to_cpu(iclog->ic_header.h_num_logops); ptr = iclog->ic_datap; base_ptr = ptr; ophead = (xlog_op_header_t *)ptr; - xhdr = (xlog_in_core_2_t *)&iclog->ic_header; + xhdr = iclog->ic_data; for (i = 0; i < len; i++) { ophead = (xlog_op_header_t *)ptr; /* clientid is only 1 byte */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { + if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - clientid = GET_CLIENT_ID(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); + clientid = xlog_get_client_id( + xhdr[j].hic_xheader.xh_cycle_data[k]); } else { - clientid = GET_CLIENT_ID(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); + clientid = xlog_get_client_id( + iclog->ic_header.h_cycle_data[idx]); } } if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) - cmn_err(CE_WARN, "xlog_verify_iclog: " - "invalid clientid %d op 0x%p offset 0x%lx", - clientid, ophead, (unsigned long)field_offset); + xfs_warn(log->l_mp, + "%s: invalid clientid %d op 0x%p offset 0x%lx", + __func__, clientid, ophead, + (unsigned long)field_offset); /* check length */ field_offset = (__psint_t) ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); - if (syncing == B_FALSE || (field_offset & 0x1ff)) { - op_len = INT_GET(ophead->oh_len, ARCH_CONVERT); + if (!syncing || (field_offset & 0x1ff)) { + op_len = be32_to_cpu(ophead->oh_len); } else { idx = BTOBBT((__psint_t)&ophead->oh_len - (__psint_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - op_len = INT_GET(xhdr[j].hic_xheader.xh_cycle_data[k], ARCH_CONVERT); + op_len = be32_to_cpu(xhdr[j].hic_xheader.xh_cycle_data[k]); } else { - op_len = INT_GET(iclog->ic_header.h_cycle_data[idx], ARCH_CONVERT); + op_len = be32_to_cpu(iclog->ic_header.h_cycle_data[idx]); } } ptr += sizeof(xlog_op_header_t) + op_len; @@ -3523,11 +3821,11 @@ xlog_verify_iclog(xlog_t *log, #endif /* - * Mark all iclogs IOERROR. LOG_LOCK is held by the caller. + * Mark all iclogs IOERROR. l_icloglock is held by the caller. */ STATIC int xlog_state_ioerror( - xlog_t *log) + struct xlog *log) { xlog_in_core_t *iclog, *ic; @@ -3561,18 +3859,19 @@ xlog_state_ioerror( * c. nothing new gets queued up after (a) and (b) are done. * d. if !logerror, flush the iclogs to disk, then seal them off * for business. + * + * Note: for delayed logging the !logerror case needs to flush the regions + * held in memory out to the iclogs before flushing them to disk. This needs + * to be done before the log is marked as shutdown, otherwise the flush to the + * iclogs will fail. */ int xfs_log_force_umount( struct xfs_mount *mp, int logerror) { - xlog_ticket_t *tic; - xlog_t *log; + struct xlog *log; int retval; - int dummy; - SPLDECL(s); - SPLDECL(s2); log = mp->m_log; @@ -3583,7 +3882,8 @@ xfs_log_force_umount( if (!log || log->l_flags & XLOG_ACTIVE_RECOVERY) { mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - XFS_BUF_DONE(mp->m_sb_bp); + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); return 0; } @@ -3596,15 +3896,25 @@ xfs_log_force_umount( return 1; } retval = 0; + + /* + * Flush the in memory commit item list before marking the log as + * being shut down. We need to do it in this order to ensure all the + * completed transactions are flushed to disk with the xfs_log_force() + * call below. + */ + if (!logerror) + xlog_cil_force(log); + /* - * We must hold both the GRANT lock and the LOG lock, - * before we mark the filesystem SHUTDOWN and wake - * everybody up to tell the bad news. + * mark the filesystem and the as in a shutdown state and wake + * everybody up to tell them the bad news. */ - s = GRANT_LOCK(log); - s2 = LOG_LOCK(log); + spin_lock(&log->l_icloglock); mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - XFS_BUF_DONE(mp->m_sb_bp); + if (mp->m_sb_bp) + XFS_BUF_DONE(mp->m_sb_bp); + /* * This flag is sort of redundant because of the mount flag, but * it's good to maintain the separation between the log and the rest @@ -3618,60 +3928,51 @@ xfs_log_force_umount( */ if (logerror) retval = xlog_state_ioerror(log); - LOG_UNLOCK(log, s2); + spin_unlock(&log->l_icloglock); /* - * We don't want anybody waiting for log reservations - * after this. That means we have to wake up everybody - * queued up on reserve_headq as well as write_headq. - * In addition, we make sure in xlog_{re}grant_log_space - * that we don't enqueue anything once the SHUTDOWN flag - * is set, and this action is protected by the GRANTLOCK. + * We don't want anybody waiting for log reservations after this. That + * means we have to wake up everybody queued up on reserveq as well as + * writeq. In addition, we make sure in xlog_{re}grant_log_space that + * we don't enqueue anything once the SHUTDOWN flag is set, and this + * action is protected by the grant locks. */ - if ((tic = log->l_reserve_headq)) { - do { - sv_signal(&tic->t_sema); - tic = tic->t_next; - } while (tic != log->l_reserve_headq); - } - - if ((tic = log->l_write_headq)) { - do { - sv_signal(&tic->t_sema); - tic = tic->t_next; - } while (tic != log->l_write_headq); - } - GRANT_UNLOCK(log, s); + xlog_grant_head_wake_all(&log->l_reserve_head); + xlog_grant_head_wake_all(&log->l_write_head); - if (! (log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { + if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { ASSERT(!logerror); /* * Force the incore logs to disk before shutting the * log down completely. */ - xlog_state_sync_all(log, XFS_LOG_FORCE|XFS_LOG_SYNC, &dummy); - s2 = LOG_LOCK(log); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); + + spin_lock(&log->l_icloglock); retval = xlog_state_ioerror(log); - LOG_UNLOCK(log, s2); + spin_unlock(&log->l_icloglock); } + /* - * Wake up everybody waiting on xfs_log_force. - * Callback all log item committed functions as if the - * log writes were completed. + * Wake up everybody waiting on xfs_log_force. Wake the CIL push first + * as if the log writes were completed. The abort handling in the log + * item committed callback functions will do this again under lock to + * avoid races. */ + wake_up_all(&log->l_cilp->xc_commit_wait); xlog_state_do_callback(log, XFS_LI_ABORTED, NULL); #ifdef XFSERRORDEBUG { xlog_in_core_t *iclog; - s = LOG_LOCK(log); + spin_lock(&log->l_icloglock); iclog = log->l_iclog; do { ASSERT(iclog->ic_callback == 0); iclog = iclog->ic_next; } while (iclog != log->l_iclog); - LOG_UNLOCK(log, s); + spin_unlock(&log->l_icloglock); } #endif /* return non-zero if log IOERROR transition had already happened */ @@ -3679,7 +3980,8 @@ xfs_log_force_umount( } STATIC int -xlog_iclogs_empty(xlog_t *log) +xlog_iclogs_empty( + struct xlog *log) { xlog_in_core_t *iclog; @@ -3694,3 +3996,4 @@ xlog_iclogs_empty(xlog_t *log) } while (iclog != log->l_iclog); return 1; } + diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index eacb3d4987f..84e0deb95ab 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -18,14 +18,81 @@ #ifndef __XFS_LOG_H__ #define __XFS_LOG_H__ -/* get lsn fields */ +struct xfs_log_vec { + struct xfs_log_vec *lv_next; /* next lv in build list */ + int lv_niovecs; /* number of iovecs in lv */ + struct xfs_log_iovec *lv_iovecp; /* iovec array */ + struct xfs_log_item *lv_item; /* owner */ + char *lv_buf; /* formatted buffer */ + int lv_bytes; /* accounted space in buffer */ + int lv_buf_len; /* aligned size of buffer */ + int lv_size; /* size of allocated lv */ +}; + +#define XFS_LOG_VEC_ORDERED (-1) + +static inline void * +xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type) +{ + struct xfs_log_iovec *vec = *vecp; + + if (vec) { + ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs); + vec++; + } else { + vec = &lv->lv_iovecp[0]; + } + + vec->i_type = type; + vec->i_addr = lv->lv_buf + lv->lv_buf_len; + + ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t))); + + *vecp = vec; + return vec->i_addr; +} + +/* + * We need to make sure the next buffer is naturally aligned for the biggest + * basic data type we put into it. We already accounted for this padding when + * sizing the buffer. + * + * However, this padding does not get written into the log, and hence we have to + * track the space used by the log vectors separately to prevent log space hangs + * due to inaccurate accounting (i.e. a leak) of the used log space through the + * CIL context ticket. + */ +static inline void +xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len) +{ + lv->lv_buf_len += round_up(len, sizeof(uint64_t)); + lv->lv_bytes += len; + vec->i_len = len; +} -#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) -#define BLOCK_LSN(lsn) ((uint)(lsn)) -/* this is used in a spot where we might otherwise double-endian-flip */ -#define CYCLE_LSN_DISK(lsn) (((uint *)&(lsn))[0]) +static inline void * +xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp, + uint type, void *data, int len) +{ + void *buf; + + buf = xlog_prepare_iovec(lv, vecp, type); + memcpy(buf, data, len); + xlog_finish_iovec(lv, *vecp, len); + return buf; +} + +/* + * Structure used to pass callback function and the function's argument + * to the log manager. + */ +typedef struct xfs_log_callback { + struct xfs_log_callback *cb_next; + void (*cb_func)(void *, int); + void *cb_arg; +} xfs_log_callback_t; -#ifdef __KERNEL__ /* * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number @@ -48,145 +115,79 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) */ /* - * Flags to xfs_log_mount - */ -#define XFS_LOG_RECOVER 0x1 - -/* * Flags to xfs_log_done() */ #define XFS_LOG_REL_PERM_RESERV 0x1 - -/* - * Flags to xfs_log_reserve() - * - * XFS_LOG_SLEEP: If space is not available, sleep (default) - * XFS_LOG_NOSLEEP: If space is not available, return error - * XFS_LOG_PERM_RESERV: Permanent reservation. When writes are - * performed against this type of reservation, the reservation - * is not decreased. Long running transactions should use this. - */ -#define XFS_LOG_SLEEP 0x0 -#define XFS_LOG_NOSLEEP 0x1 -#define XFS_LOG_PERM_RESERV 0x2 -#define XFS_LOG_RESV_ALL (XFS_LOG_NOSLEEP|XFS_LOG_PERM_RESERV) - - /* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk - * XFS_LOG_FORCE: Start in-core log write now. - * XFS_LOG_URGE: Start write within some window of time. - * - * Note: Either XFS_LOG_FORCE or XFS_LOG_URGE must be set. */ #define XFS_LOG_SYNC 0x1 -#define XFS_LOG_FORCE 0x2 -#define XFS_LOG_URGE 0x4 - -#endif /* __KERNEL__ */ - - -/* Log Clients */ -#define XFS_TRANSACTION 0x69 -#define XFS_VOLUME 0x2 -#define XFS_LOG 0xaa - - -/* Region types for iovec's i_type */ -#define XLOG_REG_TYPE_BFORMAT 1 -#define XLOG_REG_TYPE_BCHUNK 2 -#define XLOG_REG_TYPE_EFI_FORMAT 3 -#define XLOG_REG_TYPE_EFD_FORMAT 4 -#define XLOG_REG_TYPE_IFORMAT 5 -#define XLOG_REG_TYPE_ICORE 6 -#define XLOG_REG_TYPE_IEXT 7 -#define XLOG_REG_TYPE_IBROOT 8 -#define XLOG_REG_TYPE_ILOCAL 9 -#define XLOG_REG_TYPE_IATTR_EXT 10 -#define XLOG_REG_TYPE_IATTR_BROOT 11 -#define XLOG_REG_TYPE_IATTR_LOCAL 12 -#define XLOG_REG_TYPE_QFORMAT 13 -#define XLOG_REG_TYPE_DQUOT 14 -#define XLOG_REG_TYPE_QUOTAOFF 15 -#define XLOG_REG_TYPE_LRHEADER 16 -#define XLOG_REG_TYPE_UNMOUNT 17 -#define XLOG_REG_TYPE_COMMIT 18 -#define XLOG_REG_TYPE_TRANSHDR 19 -#define XLOG_REG_TYPE_MAX 19 - -#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t)) - -typedef struct xfs_log_iovec { - xfs_caddr_t i_addr; /* beginning address of region */ - int i_len; /* length in bytes of region */ - uint i_type; /* type of region */ -} xfs_log_iovec_t; - -typedef void* xfs_log_ticket_t; - -/* - * Structure used to pass callback function and the function's argument - * to the log manager. - */ -typedef struct xfs_log_callback { - struct xfs_log_callback *cb_next; - void (*cb_func)(void *, int); - void *cb_arg; -} xfs_log_callback_t; - -#ifdef __KERNEL__ /* Log manager interfaces */ struct xfs_mount; +struct xlog_in_core; +struct xlog_ticket; +struct xfs_log_item; +struct xfs_item_ops; +struct xfs_trans; +struct xfs_log_callback; + xfs_lsn_t xfs_log_done(struct xfs_mount *mp, - xfs_log_ticket_t ticket, - void **iclog, + struct xlog_ticket *ticket, + struct xlog_in_core **iclog, uint flags); int _xfs_log_force(struct xfs_mount *mp, - xfs_lsn_t lsn, uint flags, int *log_forced); -#define xfs_log_force(mp, lsn, flags) \ - _xfs_log_force(mp, lsn, flags, NULL); +void xfs_log_force(struct xfs_mount *mp, + uint flags); +int _xfs_log_force_lsn(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags, + int *log_forced); +void xfs_log_force_lsn(struct xfs_mount *mp, + xfs_lsn_t lsn, + uint flags); int xfs_log_mount(struct xfs_mount *mp, struct xfs_buftarg *log_target, xfs_daddr_t start_block, int num_bblocks); -int xfs_log_mount_finish(struct xfs_mount *mp, int); -void xfs_log_move_tail(struct xfs_mount *mp, - xfs_lsn_t tail_lsn); +int xfs_log_mount_finish(struct xfs_mount *mp); +xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); +xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp); +void xfs_log_space_wake(struct xfs_mount *mp); int xfs_log_notify(struct xfs_mount *mp, - void *iclog, - xfs_log_callback_t *callback_entry); + struct xlog_in_core *iclog, + struct xfs_log_callback *callback_entry); int xfs_log_release_iclog(struct xfs_mount *mp, - void *iclog_hndl); + struct xlog_in_core *iclog); int xfs_log_reserve(struct xfs_mount *mp, int length, int count, - xfs_log_ticket_t *ticket, + struct xlog_ticket **ticket, __uint8_t clientid, - uint flags, + bool permanent, uint t_type); -int xfs_log_write(struct xfs_mount *mp, - xfs_log_iovec_t region[], - int nentries, - xfs_log_ticket_t ticket, - xfs_lsn_t *start_lsn); -int xfs_log_unmount(struct xfs_mount *mp); +int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); int xfs_log_unmount_write(struct xfs_mount *mp); -void xfs_log_unmount_dealloc(struct xfs_mount *mp); +void xfs_log_unmount(struct xfs_mount *mp); int xfs_log_force_umount(struct xfs_mount *mp, int logerror); int xfs_log_need_covered(struct xfs_mount *mp); void xlog_iodone(struct xfs_buf *); -#endif - +struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); +void xfs_log_ticket_put(struct xlog_ticket *ticket); -extern int xlog_debug; /* set to 1 to enable real log */ +void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, int flags); +bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); +void xfs_log_work_queue(struct xfs_mount *mp); +void xfs_log_worker(struct work_struct *work); +void xfs_log_quiesce(struct xfs_mount *mp); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c new file mode 100644 index 00000000000..b3425b34e3d --- /dev/null +++ b/fs/xfs/xfs_log_cil.c @@ -0,0 +1,972 @@ +/* + * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_error.h" +#include "xfs_alloc.h" +#include "xfs_extent_busy.h" +#include "xfs_discard.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" + +/* + * Allocate a new ticket. Failing to get a new ticket makes it really hard to + * recover, so we don't allow failure here. Also, we allocate in a context that + * we don't want to be issuing transactions from, so we need to tell the + * allocation code this as well. + * + * We don't reserve any space for the ticket - we are going to steal whatever + * space we require from transactions as they commit. To ensure we reserve all + * the space required, we need to set the current reservation of the ticket to + * zero so that we know to steal the initial transaction overhead from the + * first transaction commit. + */ +static struct xlog_ticket * +xlog_cil_ticket_alloc( + struct xlog *log) +{ + struct xlog_ticket *tic; + + tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0, + KM_SLEEP|KM_NOFS); + tic->t_trans_type = XFS_TRANS_CHECKPOINT; + + /* + * set the current reservation to zero so we know to steal the basic + * transaction overhead reservation from the first transaction commit. + */ + tic->t_curr_res = 0; + return tic; +} + +/* + * After the first stage of log recovery is done, we know where the head and + * tail of the log are. We need this log initialisation done before we can + * initialise the first CIL checkpoint context. + * + * Here we allocate a log ticket to track space usage during a CIL push. This + * ticket is passed to xlog_write() directly so that we don't slowly leak log + * space by failing to account for space used by log headers and additional + * region headers for split regions. + */ +void +xlog_cil_init_post_recovery( + struct xlog *log) +{ + log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); + log->l_cilp->xc_ctx->sequence = 1; + log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle, + log->l_curr_block); +} + +/* + * Prepare the log item for insertion into the CIL. Calculate the difference in + * log space and vectors it will consume, and if it is a new item pin it as + * well. + */ +STATIC void +xfs_cil_prepare_item( + struct xlog *log, + struct xfs_log_vec *lv, + struct xfs_log_vec *old_lv, + int *diff_len, + int *diff_iovecs) +{ + /* Account for the new LV being passed in */ + if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) { + *diff_len += lv->lv_bytes; + *diff_iovecs += lv->lv_niovecs; + } + + /* + * If there is no old LV, this is the first time we've seen the item in + * this CIL context and so we need to pin it. If we are replacing the + * old_lv, then remove the space it accounts for and free it. + */ + if (!old_lv) + lv->lv_item->li_ops->iop_pin(lv->lv_item); + else if (old_lv != lv) { + ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); + + *diff_len -= old_lv->lv_bytes; + *diff_iovecs -= old_lv->lv_niovecs; + kmem_free(old_lv); + } + + /* attach new log vector to log item */ + lv->lv_item->li_lv = lv; + + /* + * If this is the first time the item is being committed to the + * CIL, store the sequence number on the log item so we can + * tell in future commits whether this is the first checkpoint + * the item is being committed into. + */ + if (!lv->lv_item->li_seq) + lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; +} + +/* + * Format log item into a flat buffers + * + * For delayed logging, we need to hold a formatted buffer containing all the + * changes on the log item. This enables us to relog the item in memory and + * write it out asynchronously without needing to relock the object that was + * modified at the time it gets written into the iclog. + * + * This function builds a vector for the changes in each log item in the + * transaction. It then works out the length of the buffer needed for each log + * item, allocates them and formats the vector for the item into the buffer. + * The buffer is then attached to the log item are then inserted into the + * Committed Item List for tracking until the next checkpoint is written out. + * + * We don't set up region headers during this process; we simply copy the + * regions into the flat buffer. We can do this because we still have to do a + * formatting step to write the regions into the iclog buffer. Writing the + * ophdrs during the iclog write means that we can support splitting large + * regions across iclog boundares without needing a change in the format of the + * item/region encapsulation. + * + * Hence what we need to do now is change the rewrite the vector array to point + * to the copied region inside the buffer we just allocated. This allows us to + * format the regions into the iclog as though they are being formatted + * directly out of the objects themselves. + */ +static void +xlog_cil_insert_format_items( + struct xlog *log, + struct xfs_trans *tp, + int *diff_len, + int *diff_iovecs) +{ + struct xfs_log_item_desc *lidp; + + + /* Bail out if we didn't find a log item. */ + if (list_empty(&tp->t_items)) { + ASSERT(0); + return; + } + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_vec *lv; + struct xfs_log_vec *old_lv; + int niovecs = 0; + int nbytes = 0; + int buf_size; + bool ordered = false; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + /* get number of vecs and size of data to be stored */ + lip->li_ops->iop_size(lip, &niovecs, &nbytes); + + /* Skip items that do not have any vectors for writing */ + if (!niovecs) + continue; + + /* + * Ordered items need to be tracked but we do not wish to write + * them. We need a logvec to track the object, but we do not + * need an iovec or buffer to be allocated for copying data. + */ + if (niovecs == XFS_LOG_VEC_ORDERED) { + ordered = true; + niovecs = 0; + nbytes = 0; + } + + /* + * We 64-bit align the length of each iovec so that the start + * of the next one is naturally aligned. We'll need to + * account for that slack space here. Then round nbytes up + * to 64-bit alignment so that the initial buffer alignment is + * easy to calculate and verify. + */ + nbytes += niovecs * sizeof(uint64_t); + nbytes = round_up(nbytes, sizeof(uint64_t)); + + /* grab the old item if it exists for reservation accounting */ + old_lv = lip->li_lv; + + /* + * The data buffer needs to start 64-bit aligned, so round up + * that space to ensure we can align it appropriately and not + * overrun the buffer. + */ + buf_size = nbytes + + round_up((sizeof(struct xfs_log_vec) + + niovecs * sizeof(struct xfs_log_iovec)), + sizeof(uint64_t)); + + /* compare to existing item size */ + if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { + /* same or smaller, optimise common overwrite case */ + lv = lip->li_lv; + lv->lv_next = NULL; + + if (ordered) + goto insert; + + /* + * set the item up as though it is a new insertion so + * that the space reservation accounting is correct. + */ + *diff_iovecs -= lv->lv_niovecs; + *diff_len -= lv->lv_bytes; + } else { + /* allocate new data chunk */ + lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); + lv->lv_item = lip; + lv->lv_size = buf_size; + if (ordered) { + /* track as an ordered logvec */ + ASSERT(lip->li_lv == NULL); + lv->lv_buf_len = XFS_LOG_VEC_ORDERED; + goto insert; + } + lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; + } + + /* Ensure the lv is set up according to ->iop_size */ + lv->lv_niovecs = niovecs; + + /* The allocated data region lies beyond the iovec region */ + lv->lv_buf_len = 0; + lv->lv_bytes = 0; + lv->lv_buf = (char *)lv + buf_size - nbytes; + ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); + + lip->li_ops->iop_format(lip, lv); +insert: + ASSERT(lv->lv_buf_len <= nbytes); + xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); + } +} + +/* + * Insert the log items into the CIL and calculate the difference in space + * consumed by the item. Add the space to the checkpoint ticket and calculate + * if the change requires additional log metadata. If it does, take that space + * as well. Remove the amount of space we added to the checkpoint ticket from + * the current transaction ticket so that the accounting works out correctly. + */ +static void +xlog_cil_insert_items( + struct xlog *log, + struct xfs_trans *tp) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx = cil->xc_ctx; + struct xfs_log_item_desc *lidp; + int len = 0; + int diff_iovecs = 0; + int iclog_space; + + ASSERT(tp); + + /* + * We can do this safely because the context can't checkpoint until we + * are done so it doesn't matter exactly how we update the CIL. + */ + xlog_cil_insert_format_items(log, tp, &len, &diff_iovecs); + + /* + * Now (re-)position everything modified at the tail of the CIL. + * We do this here so we only need to take the CIL lock once during + * the transaction commit. + */ + spin_lock(&cil->xc_cil_lock); + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + /* Skip items which aren't dirty in this transaction. */ + if (!(lidp->lid_flags & XFS_LID_DIRTY)) + continue; + + list_move_tail(&lip->li_cil, &cil->xc_cil); + } + + /* account for space used by new iovec headers */ + len += diff_iovecs * sizeof(xlog_op_header_t); + ctx->nvecs += diff_iovecs; + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) + list_splice_init(&tp->t_busy, &ctx->busy_extents); + + /* + * Now transfer enough transaction reservation to the context ticket + * for the checkpoint. The context ticket is special - the unit + * reservation has to grow as well as the current reservation as we + * steal from tickets so we can correctly determine the space used + * during the transaction commit. + */ + if (ctx->ticket->t_curr_res == 0) { + ctx->ticket->t_curr_res = ctx->ticket->t_unit_res; + tp->t_ticket->t_curr_res -= ctx->ticket->t_unit_res; + } + + /* do we need space for more log record headers? */ + iclog_space = log->l_iclog_size - log->l_iclog_hsize; + if (len > 0 && (ctx->space_used / iclog_space != + (ctx->space_used + len) / iclog_space)) { + int hdrs; + + hdrs = (len + iclog_space - 1) / iclog_space; + /* need to take into account split region headers, too */ + hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header); + ctx->ticket->t_unit_res += hdrs; + ctx->ticket->t_curr_res += hdrs; + tp->t_ticket->t_curr_res -= hdrs; + ASSERT(tp->t_ticket->t_curr_res >= len); + } + tp->t_ticket->t_curr_res -= len; + ctx->space_used += len; + + spin_unlock(&cil->xc_cil_lock); +} + +static void +xlog_cil_free_logvec( + struct xfs_log_vec *log_vector) +{ + struct xfs_log_vec *lv; + + for (lv = log_vector; lv; ) { + struct xfs_log_vec *next = lv->lv_next; + kmem_free(lv); + lv = next; + } +} + +/* + * Mark all items committed and clear busy extents. We free the log vector + * chains in a separate pass so that we unpin the log items as quickly as + * possible. + */ +static void +xlog_cil_committed( + void *args, + int abort) +{ + struct xfs_cil_ctx *ctx = args; + struct xfs_mount *mp = ctx->cil->xc_log->l_mp; + + xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, + ctx->start_lsn, abort); + + xfs_extent_busy_sort(&ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, + (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + + /* + * If we are aborting the commit, wake up anyone waiting on the + * committing list. If we don't, then a shutdown we can leave processes + * waiting in xlog_cil_force_lsn() waiting on a sequence commit that + * will never happen because we aborted it. + */ + spin_lock(&ctx->cil->xc_push_lock); + if (abort) + wake_up_all(&ctx->cil->xc_commit_wait); + list_del(&ctx->committing); + spin_unlock(&ctx->cil->xc_push_lock); + + xlog_cil_free_logvec(ctx->lv_chain); + + if (!list_empty(&ctx->busy_extents)) { + ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + + xfs_discard_extents(mp, &ctx->busy_extents); + xfs_extent_busy_clear(mp, &ctx->busy_extents, false); + } + + kmem_free(ctx); +} + +/* + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. + */ +STATIC int +xlog_cil_push( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_log_vec *lv; + struct xfs_cil_ctx *ctx; + struct xfs_cil_ctx *new_ctx; + struct xlog_in_core *commit_iclog; + struct xlog_ticket *tic; + int num_iovecs; + int error = 0; + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; + xfs_lsn_t commit_lsn; + xfs_lsn_t push_seq; + + if (!cil) + return 0; + + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); + new_ctx->ticket = xlog_cil_ticket_alloc(log); + + down_write(&cil->xc_ctx_lock); + ctx = cil->xc_ctx; + + spin_lock(&cil->xc_push_lock); + push_seq = cil->xc_push_seq; + ASSERT(push_seq <= ctx->sequence); + + /* + * Check if we've anything to push. If there is nothing, then we don't + * move on to a new sequence number and so we have to be able to push + * this sequence again later. + */ + if (list_empty(&cil->xc_cil)) { + cil->xc_push_seq = 0; + spin_unlock(&cil->xc_push_lock); + goto out_skip; + } + spin_unlock(&cil->xc_push_lock); + + + /* check for a previously pushed seqeunce */ + if (push_seq < cil->xc_ctx->sequence) + goto out_skip; + + /* + * pull all the log vectors off the items in the CIL, and + * remove the items from the CIL. We don't need the CIL lock + * here because it's only needed on the transaction commit + * side which is currently locked out by the flush lock. + */ + lv = NULL; + num_iovecs = 0; + while (!list_empty(&cil->xc_cil)) { + struct xfs_log_item *item; + + item = list_first_entry(&cil->xc_cil, + struct xfs_log_item, li_cil); + list_del_init(&item->li_cil); + if (!ctx->lv_chain) + ctx->lv_chain = item->li_lv; + else + lv->lv_next = item->li_lv; + lv = item->li_lv; + item->li_lv = NULL; + num_iovecs += lv->lv_niovecs; + } + + /* + * initialise the new context and attach it to the CIL. Then attach + * the current context to the CIL committing lsit so it can be found + * during log forces to extract the commit lsn of the sequence that + * needs to be forced. + */ + INIT_LIST_HEAD(&new_ctx->committing); + INIT_LIST_HEAD(&new_ctx->busy_extents); + new_ctx->sequence = ctx->sequence + 1; + new_ctx->cil = cil; + cil->xc_ctx = new_ctx; + + /* + * The switch is now done, so we can drop the context lock and move out + * of a shared context. We can't just go straight to the commit record, + * though - we need to synchronise with previous and future commits so + * that the commit records are correctly ordered in the log to ensure + * that we process items during log IO completion in the correct order. + * + * For example, if we get an EFI in one checkpoint and the EFD in the + * next (e.g. due to log forces), we do not want the checkpoint with + * the EFD to be committed before the checkpoint with the EFI. Hence + * we must strictly order the commit records of the checkpoints so + * that: a) the checkpoint callbacks are attached to the iclogs in the + * correct order; and b) the checkpoints are replayed in correct order + * in log recovery. + * + * Hence we need to add this context to the committing context list so + * that higher sequences will wait for us to write out a commit record + * before they do. + * + * xfs_log_force_lsn requires us to mirror the new sequence into the cil + * structure atomically with the addition of this sequence to the + * committing list. This also ensures that we can do unlocked checks + * against the current sequence in log forces without risking + * deferencing a freed context pointer. + */ + spin_lock(&cil->xc_push_lock); + cil->xc_current_sequence = new_ctx->sequence; + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_push_lock); + up_write(&cil->xc_ctx_lock); + + /* + * Build a checkpoint transaction header and write it to the log to + * begin the transaction. We need to account for the space used by the + * transaction header here as it is not accounted for in xlog_write(). + * + * The LSN we need to pass to the log items on transaction commit is + * the LSN reported by the first log vector write. If we use the commit + * record lsn then we can move the tail beyond the grant write head. + */ + tic = ctx->ticket; + thdr.th_magic = XFS_TRANS_HEADER_MAGIC; + thdr.th_type = XFS_TRANS_CHECKPOINT; + thdr.th_tid = tic->t_tid; + thdr.th_num_items = num_iovecs; + lhdr.i_addr = &thdr; + lhdr.i_len = sizeof(xfs_trans_header_t); + lhdr.i_type = XLOG_REG_TYPE_TRANSHDR; + tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t); + + lvhdr.lv_niovecs = 1; + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + + error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0); + if (error) + goto out_abort_free_ticket; + + /* + * now that we've written the checkpoint into the log, strictly + * order the commit records so replay will get them in the right order. + */ +restart: + spin_lock(&cil->xc_push_lock); + list_for_each_entry(new_ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) { + spin_unlock(&cil->xc_push_lock); + goto out_abort_free_ticket; + } + + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for our own sequence, either. + */ + if (new_ctx->sequence >= ctx->sequence) + continue; + if (!new_ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + } + spin_unlock(&cil->xc_push_lock); + + /* xfs_log_done always frees the ticket on error. */ + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + if (commit_lsn == -1) + goto out_abort; + + /* attach all the transactions w/ busy extents to iclog */ + ctx->log_cb.cb_func = xlog_cil_committed; + ctx->log_cb.cb_arg = ctx; + error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb); + if (error) + goto out_abort; + + /* + * now the checkpoint commit is complete and we've attached the + * callbacks to the iclog we can assign the commit LSN to the context + * and wake up anyone who is waiting for the commit to complete. + */ + spin_lock(&cil->xc_push_lock); + ctx->commit_lsn = commit_lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_push_lock); + + /* release the hounds! */ + return xfs_log_release_iclog(log->l_mp, commit_iclog); + +out_skip: + up_write(&cil->xc_ctx_lock); + xfs_log_ticket_put(new_ctx->ticket); + kmem_free(new_ctx); + return 0; + +out_abort_free_ticket: + xfs_log_ticket_put(tic); +out_abort: + xlog_cil_committed(ctx, XFS_LI_ABORTED); + return XFS_ERROR(EIO); +} + +static void +xlog_cil_push_work( + struct work_struct *work) +{ + struct xfs_cil *cil = container_of(work, struct xfs_cil, + xc_push_work); + xlog_cil_push(cil->xc_log); +} + +/* + * We need to push CIL every so often so we don't cache more than we can fit in + * the log. The limit really is that a checkpoint can't be more than half the + * log (the current checkpoint is not allowed to overwrite the previous + * checkpoint), but commit latency and memory usage limit this to a smaller + * size. + */ +static void +xlog_cil_push_background( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + + /* + * The cil won't be empty because we are called while holding the + * context lock so whatever we added to the CIL will still be there + */ + ASSERT(!list_empty(&cil->xc_cil)); + + /* + * don't do a background push if we haven't used up all the + * space available yet. + */ + if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + return; + + spin_lock(&cil->xc_push_lock); + if (cil->xc_push_seq < cil->xc_current_sequence) { + cil->xc_push_seq = cil->xc_current_sequence; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + } + spin_unlock(&cil->xc_push_lock); + +} + +/* + * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence + * number that is passed. When it returns, the work will be queued for + * @push_seq, but it won't be completed. The caller is expected to do any + * waiting for push_seq to complete if it is required. + */ +static void +xlog_cil_push_now( + struct xlog *log, + xfs_lsn_t push_seq) +{ + struct xfs_cil *cil = log->l_cilp; + + if (!cil) + return; + + ASSERT(push_seq && push_seq <= cil->xc_current_sequence); + + /* start on any pending background push to minimise wait time on it */ + flush_work(&cil->xc_push_work); + + /* + * If the CIL is empty or we've already pushed the sequence then + * there's no work we need to do. + */ + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) { + spin_unlock(&cil->xc_push_lock); + return; + } + + cil->xc_push_seq = push_seq; + queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + spin_unlock(&cil->xc_push_lock); +} + +bool +xlog_cil_empty( + struct xlog *log) +{ + struct xfs_cil *cil = log->l_cilp; + bool empty = false; + + spin_lock(&cil->xc_push_lock); + if (list_empty(&cil->xc_cil)) + empty = true; + spin_unlock(&cil->xc_push_lock); + return empty; +} + +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +void +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct xlog *log = mp->m_log; + struct xfs_cil *cil = log->l_cilp; + int log_flags = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + /* lock out background commit */ + down_read(&cil->xc_ctx_lock); + + xlog_cil_insert_items(log, tp); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(mp, tp->t_ticket); + + tp->t_commit_lsn = cil->xc_ctx->sequence; + if (commit_lsn) + *commit_lsn = tp->t_commit_lsn; + + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + + xlog_cil_push_background(log); + + up_read(&cil->xc_ctx_lock); +} + +/* + * Conditionally push the CIL based on the sequence passed in. + * + * We only need to push if we haven't already pushed the sequence + * number given. Hence the only time we will trigger a push here is + * if the push sequence is the same as the current context. + * + * We return the current commit lsn to allow the callers to determine if a + * iclog flush is necessary following this call. + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence) +{ + struct xfs_cil *cil = log->l_cilp; + struct xfs_cil_ctx *ctx; + xfs_lsn_t commit_lsn = NULLCOMMITLSN; + + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ +restart: + xlog_cil_push_now(log, sequence); + + /* + * See if we can find a previous sequence still committing. + * We need to wait for all previous sequence commits to complete + * before allowing the force of push_seq to go ahead. Hence block + * on commits for those as well. + */ + spin_lock(&cil->xc_push_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (XLOG_FORCED_SHUTDOWN(log)) + goto out_shutdown; + if (ctx->sequence > sequence) + continue; + if (!ctx->commit_lsn) { + /* + * It is still being pushed! Wait for the push to + * complete, then start again from the beginning. + */ + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + if (ctx->sequence != sequence) + continue; + /* found it! */ + commit_lsn = ctx->commit_lsn; + } + + /* + * The call to xlog_cil_push_now() executes the push in the background. + * Hence by the time we have got here it our sequence may not have been + * pushed yet. This is true if the current sequence still matches the + * push sequence after the above wait loop and the CIL still contains + * dirty objects. + * + * When the push occurs, it will empty the CIL and atomically increment + * the currect sequence past the push sequence and move it into the + * committing list. Of course, if the CIL is clean at the time of the + * push, it won't have pushed the CIL at all, so in that case we should + * try the push for this sequence again from the start just in case. + */ + if (sequence == cil->xc_current_sequence && + !list_empty(&cil->xc_cil)) { + spin_unlock(&cil->xc_push_lock); + goto restart; + } + + spin_unlock(&cil->xc_push_lock); + return commit_lsn; + + /* + * We detected a shutdown in progress. We need to trigger the log force + * to pass through it's iclog state machine error handling, even though + * we are already in a shutdown state. Hence we can't return + * NULLCOMMITLSN here as that has special meaning to log forces (i.e. + * LSN is already stable), so we return a zero LSN instead. + */ +out_shutdown: + spin_unlock(&cil->xc_push_lock); + return 0; +} + +/* + * Check if the current log item was first committed in this sequence. + * We can't rely on just the log item being in the CIL, we have to check + * the recorded commit sequence number. + * + * Note: for this to be used in a non-racy manner, it has to be called with + * CIL flushing locked out. As a result, it should only be used during the + * transaction commit process when deciding what to format into the item. + */ +bool +xfs_log_item_in_current_chkpt( + struct xfs_log_item *lip) +{ + struct xfs_cil_ctx *ctx; + + if (list_empty(&lip->li_cil)) + return false; + + ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; + + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. + */ + if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) + return false; + return true; +} + +/* + * Perform initial CIL structure initialisation. + */ +int +xlog_cil_init( + struct xlog *log) +{ + struct xfs_cil *cil; + struct xfs_cil_ctx *ctx; + + cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); + if (!cil) + return ENOMEM; + + ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); + if (!ctx) { + kmem_free(cil); + return ENOMEM; + } + + INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); + INIT_LIST_HEAD(&cil->xc_cil); + INIT_LIST_HEAD(&cil->xc_committing); + spin_lock_init(&cil->xc_cil_lock); + spin_lock_init(&cil->xc_push_lock); + init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_commit_wait); + + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + ctx->sequence = 1; + ctx->cil = cil; + cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; + + cil->xc_log = log; + log->l_cilp = cil; + return 0; +} + +void +xlog_cil_destroy( + struct xlog *log) +{ + if (log->l_cilp->xc_ctx) { + if (log->l_cilp->xc_ctx->ticket) + xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket); + kmem_free(log->l_cilp->xc_ctx); + } + + ASSERT(list_empty(&log->l_cilp->xc_cil)); + kmem_free(log->l_cilp); +} + diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h new file mode 100644 index 00000000000..f0969c77bdb --- /dev/null +++ b/fs/xfs/xfs_log_format.h @@ -0,0 +1,679 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_LOG_FORMAT_H__ +#define __XFS_LOG_FORMAT_H__ + +struct xfs_mount; +struct xfs_trans_res; + +/* + * On-disk Log Format definitions. + * + * This file contains all the on-disk format definitions used within the log. It + * includes the physical log structure itself, as well as all the log item + * format structures that are written into the log and intepreted by log + * recovery. We start with the physical log format definitions, and then work + * through all the log items definitions and everything they encode into the + * log. + */ +typedef __uint32_t xlog_tid_t; + +#define XLOG_MIN_ICLOGS 2 +#define XLOG_MAX_ICLOGS 8 +#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ +#define XLOG_VERSION_1 1 +#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ +#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) +#define XLOG_MIN_RECORD_BSIZE (16*1024) /* eventually 32k */ +#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ +#define XLOG_MAX_RECORD_BSIZE (256*1024) +#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ +#define XLOG_MIN_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ +#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ +#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ +#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ + (log)->l_mp->m_sb.sb_logsunit) +#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) + +#define XLOG_HEADER_SIZE 512 + +/* Minimum number of transactions that must fit in the log (defined by mkfs) */ +#define XFS_MIN_LOG_FACTOR 3 + +#define XLOG_REC_SHIFT(log) \ + BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) +#define XLOG_TOTAL_REC_SHIFT(log) \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) + +/* get lsn fields */ +#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) +#define BLOCK_LSN(lsn) ((uint)(lsn)) + +/* this is used in a spot where we might otherwise double-endian-flip */ +#define CYCLE_LSN_DISK(lsn) (((__be32 *)&(lsn))[0]) + +static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) +{ + return ((xfs_lsn_t)cycle << 32) | block; +} + +static inline uint xlog_get_cycle(char *ptr) +{ + if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) + return be32_to_cpu(*((__be32 *)ptr + 1)); + else + return be32_to_cpu(*(__be32 *)ptr); +} + +/* Log Clients */ +#define XFS_TRANSACTION 0x69 +#define XFS_VOLUME 0x2 +#define XFS_LOG 0xaa + +#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ + +/* Region types for iovec's i_type */ +#define XLOG_REG_TYPE_BFORMAT 1 +#define XLOG_REG_TYPE_BCHUNK 2 +#define XLOG_REG_TYPE_EFI_FORMAT 3 +#define XLOG_REG_TYPE_EFD_FORMAT 4 +#define XLOG_REG_TYPE_IFORMAT 5 +#define XLOG_REG_TYPE_ICORE 6 +#define XLOG_REG_TYPE_IEXT 7 +#define XLOG_REG_TYPE_IBROOT 8 +#define XLOG_REG_TYPE_ILOCAL 9 +#define XLOG_REG_TYPE_IATTR_EXT 10 +#define XLOG_REG_TYPE_IATTR_BROOT 11 +#define XLOG_REG_TYPE_IATTR_LOCAL 12 +#define XLOG_REG_TYPE_QFORMAT 13 +#define XLOG_REG_TYPE_DQUOT 14 +#define XLOG_REG_TYPE_QUOTAOFF 15 +#define XLOG_REG_TYPE_LRHEADER 16 +#define XLOG_REG_TYPE_UNMOUNT 17 +#define XLOG_REG_TYPE_COMMIT 18 +#define XLOG_REG_TYPE_TRANSHDR 19 +#define XLOG_REG_TYPE_ICREATE 20 +#define XLOG_REG_TYPE_MAX 20 + +/* + * Flags to log operation header + * + * The first write of a new transaction will be preceded with a start + * record, XLOG_START_TRANS. Once a transaction is committed, a commit + * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into + * the remainder of the current active in-core log, it is split up into + * multiple regions. Each partial region will be marked with a + * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. + * + */ +#define XLOG_START_TRANS 0x01 /* Start a new transaction */ +#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ +#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ +#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ +#define XLOG_END_TRANS 0x10 /* End a continued transaction */ +#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ + + +typedef struct xlog_op_header { + __be32 oh_tid; /* transaction id of operation : 4 b */ + __be32 oh_len; /* bytes in data region : 4 b */ + __u8 oh_clientid; /* who sent me this : 1 b */ + __u8 oh_flags; /* : 1 b */ + __u16 oh_res2; /* 32 bit align : 2 b */ +} xlog_op_header_t; + +/* valid values for h_fmt */ +#define XLOG_FMT_UNKNOWN 0 +#define XLOG_FMT_LINUX_LE 1 +#define XLOG_FMT_LINUX_BE 2 +#define XLOG_FMT_IRIX_BE 3 + +/* our fmt */ +#ifdef XFS_NATIVE_HOST +#define XLOG_FMT XLOG_FMT_LINUX_BE +#else +#define XLOG_FMT XLOG_FMT_LINUX_LE +#endif + +typedef struct xlog_rec_header { + __be32 h_magicno; /* log record (LR) identifier : 4 */ + __be32 h_cycle; /* write cycle of log : 4 */ + __be32 h_version; /* LR version : 4 */ + __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ + __be64 h_lsn; /* lsn of this LR : 8 */ + __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ + __le32 h_crc; /* crc of log record : 4 */ + __be32 h_prev_block; /* block number to previous LR : 4 */ + __be32 h_num_logops; /* number of log operations in this LR : 4 */ + __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; + /* new fields */ + __be32 h_fmt; /* format of log record : 4 */ + uuid_t h_fs_uuid; /* uuid of FS : 16 */ + __be32 h_size; /* iclog size : 4 */ +} xlog_rec_header_t; + +typedef struct xlog_rec_ext_header { + __be32 xh_cycle; /* write cycle of log : 4 */ + __be32 xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ +} xlog_rec_ext_header_t; + +/* + * Quite misnamed, because this union lays out the actual on-disk log buffer. + */ +typedef union xlog_in_core2 { + xlog_rec_header_t hic_header; + xlog_rec_ext_header_t hic_xheader; + char hic_sector[XLOG_HEADER_SIZE]; +} xlog_in_core_2_t; + +/* not an on-disk structure, but needed by log recovery in userspace */ +typedef struct xfs_log_iovec { + void *i_addr; /* beginning address of region */ + int i_len; /* length in bytes of region */ + uint i_type; /* type of region */ +} xfs_log_iovec_t; + + +/* + * Transaction Header definitions. + * + * This is the structure written in the log at the head of every transaction. It + * identifies the type and id of the transaction, and contains the number of + * items logged by the transaction so we know how many to expect during + * recovery. + * + * Do not change the below structure without redoing the code in + * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). + */ +typedef struct xfs_trans_header { + uint th_magic; /* magic number */ + uint th_type; /* transaction type */ + __int32_t th_tid; /* transaction id (unused) */ + uint th_num_items; /* num items logged by trans */ +} xfs_trans_header_t; + +#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ + +/* + * Log item types. + */ +#define XFS_LI_EFI 0x1236 +#define XFS_LI_EFD 0x1237 +#define XFS_LI_IUNLINK 0x1238 +#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ +#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ +#define XFS_LI_DQUOT 0x123d +#define XFS_LI_QUOTAOFF 0x123e +#define XFS_LI_ICREATE 0x123f + +#define XFS_LI_TYPE_DESC \ + { XFS_LI_EFI, "XFS_LI_EFI" }, \ + { XFS_LI_EFD, "XFS_LI_EFD" }, \ + { XFS_LI_IUNLINK, "XFS_LI_IUNLINK" }, \ + { XFS_LI_INODE, "XFS_LI_INODE" }, \ + { XFS_LI_BUF, "XFS_LI_BUF" }, \ + { XFS_LI_DQUOT, "XFS_LI_DQUOT" }, \ + { XFS_LI_QUOTAOFF, "XFS_LI_QUOTAOFF" }, \ + { XFS_LI_ICREATE, "XFS_LI_ICREATE" } + +/* + * Inode Log Item Format definitions. + * + * This is the structure used to lay out an inode log item in the + * log. The size of the inline data/extents/b-tree root to be logged + * (if any) is indicated in the ilf_dsize field. Changes to this structure + * must be added on to the end. + */ +typedef struct xfs_inode_log_format { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_t; + +typedef struct xfs_inode_log_format_32 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} __attribute__((packed)) xfs_inode_log_format_32_t; + +typedef struct xfs_inode_log_format_64 { + __uint16_t ilf_type; /* inode log item type */ + __uint16_t ilf_size; /* size of this item */ + __uint32_t ilf_fields; /* flags for fields logged */ + __uint16_t ilf_asize; /* size of attr d/ext/root */ + __uint16_t ilf_dsize; /* size of data/ext/root */ + __uint32_t ilf_pad; /* pad for 64 bit boundary */ + __uint64_t ilf_ino; /* inode number */ + union { + __uint32_t ilfu_rdev; /* rdev value for dev inode*/ + uuid_t ilfu_uuid; /* mount point value */ + } ilf_u; + __int64_t ilf_blkno; /* blkno of inode buffer */ + __int32_t ilf_len; /* len of inode buffer */ + __int32_t ilf_boffset; /* off of inode in buffer */ +} xfs_inode_log_format_64_t; + +/* + * Flags for xfs_trans_log_inode flags field. + */ +#define XFS_ILOG_CORE 0x001 /* log standard inode fields */ +#define XFS_ILOG_DDATA 0x002 /* log i_df.if_data */ +#define XFS_ILOG_DEXT 0x004 /* log i_df.if_extents */ +#define XFS_ILOG_DBROOT 0x008 /* log i_df.i_broot */ +#define XFS_ILOG_DEV 0x010 /* log the dev field */ +#define XFS_ILOG_UUID 0x020 /* log the uuid field */ +#define XFS_ILOG_ADATA 0x040 /* log i_af.if_data */ +#define XFS_ILOG_AEXT 0x080 /* log i_af.if_extents */ +#define XFS_ILOG_ABROOT 0x100 /* log i_af.i_broot */ +#define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ +#define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ + + +/* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk + * in the ilf_fields of the inode_log_format, but is purely store in-memory in + * ili_fields in the inode_log_item. + */ +#define XFS_ILOG_TIMESTAMP 0x4000 + +#define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_UUID | XFS_ILOG_ADATA | \ + XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +#define XFS_ILOG_DFORK (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT) + +#define XFS_ILOG_AFORK (XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT) + +#define XFS_ILOG_ALL (XFS_ILOG_CORE | XFS_ILOG_DDATA | \ + XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \ + XFS_ILOG_DEV | XFS_ILOG_UUID | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ + XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \ + XFS_ILOG_DOWNER | XFS_ILOG_AOWNER) + +static inline int xfs_ilog_fbroot(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT); +} + +static inline int xfs_ilog_fext(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); +} + +static inline int xfs_ilog_fdata(int w) +{ + return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA); +} + +/* + * Incore version of the on-disk inode core structures. We log this directly + * into the journal in host CPU format (for better or worse) and as such + * directly mirrors the xfs_dinode structure as it must contain all the same + * information. + */ +typedef struct xfs_ictimestamp { + __int32_t t_sec; /* timestamp seconds */ + __int32_t t_nsec; /* timestamp nanoseconds */ +} xfs_ictimestamp_t; + +/* + * NOTE: This structure must be kept identical to struct xfs_dinode + * in xfs_dinode.h except for the endianness annotations. + */ +typedef struct xfs_icdinode { + __uint16_t di_magic; /* inode magic # = XFS_DINODE_MAGIC */ + __uint16_t di_mode; /* mode and type of file */ + __int8_t di_version; /* inode version */ + __int8_t di_format; /* format of di_c data */ + __uint16_t di_onlink; /* old number of links to file */ + __uint32_t di_uid; /* owner's user id */ + __uint32_t di_gid; /* owner's group id */ + __uint32_t di_nlink; /* number of links to file */ + __uint16_t di_projid_lo; /* lower part of owner's project id */ + __uint16_t di_projid_hi; /* higher part of owner's project id */ + __uint8_t di_pad[6]; /* unused, zeroed space */ + __uint16_t di_flushiter; /* incremented on flush */ + xfs_ictimestamp_t di_atime; /* time last accessed */ + xfs_ictimestamp_t di_mtime; /* time last modified */ + xfs_ictimestamp_t di_ctime; /* time created/inode modified */ + xfs_fsize_t di_size; /* number of bytes in file */ + xfs_drfsbno_t di_nblocks; /* # of direct & btree blocks used */ + xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ + xfs_extnum_t di_nextents; /* number of extents in data fork */ + xfs_aextnum_t di_anextents; /* number of extents in attribute fork*/ + __uint8_t di_forkoff; /* attr fork offs, <<3 for 64b align */ + __int8_t di_aformat; /* format of attr fork's data */ + __uint32_t di_dmevmask; /* DMIG event mask */ + __uint16_t di_dmstate; /* DMIG state info */ + __uint16_t di_flags; /* random flags, XFS_DIFLAG_... */ + __uint32_t di_gen; /* generation number */ + + /* di_next_unlinked is the only non-core field in the old dinode */ + xfs_agino_t di_next_unlinked;/* agi unlinked list ptr */ + + /* start of the extended dinode, writable fields */ + __uint32_t di_crc; /* CRC of the inode */ + __uint64_t di_changecount; /* number of attribute changes */ + xfs_lsn_t di_lsn; /* flush sequence */ + __uint64_t di_flags2; /* more random flags */ + __uint8_t di_pad2[16]; /* more padding for future expansion */ + + /* fields only written to during inode creation */ + xfs_ictimestamp_t di_crtime; /* time created */ + xfs_ino_t di_ino; /* inode number */ + uuid_t di_uuid; /* UUID of the filesystem */ + + /* structure must be padded to 64 bit alignment */ +} xfs_icdinode_t; + +static inline uint xfs_icdinode_size(int version) +{ + if (version == 3) + return sizeof(struct xfs_icdinode); + return offsetof(struct xfs_icdinode, di_next_unlinked); +} + +/* + * Buffer Log Format defintions + * + * These are the physical dirty bitmap defintions for the log format structure. + */ +#define XFS_BLF_CHUNK 128 +#define XFS_BLF_SHIFT 7 +#define BIT_TO_WORD_SHIFT 5 +#define NBWORD (NBBY * sizeof(unsigned int)) + +/* + * This flag indicates that the buffer contains on disk inodes + * and requires special recovery handling. + */ +#define XFS_BLF_INODE_BUF (1<<0) + +/* + * This flag indicates that the buffer should not be replayed + * during recovery because its blocks are being freed. + */ +#define XFS_BLF_CANCEL (1<<1) + +/* + * This flag indicates that the buffer contains on disk + * user or group dquots and may require special recovery handling. + */ +#define XFS_BLF_UDQUOT_BUF (1<<2) +#define XFS_BLF_PDQUOT_BUF (1<<3) +#define XFS_BLF_GDQUOT_BUF (1<<4) + +/* + * This is the structure used to lay out a buf log item in the + * log. The data map describes which 128 byte chunks of the buffer + * have been logged. + */ +#define XFS_BLF_DATAMAP_SIZE ((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) / NBWORD) + +typedef struct xfs_buf_log_format { + unsigned short blf_type; /* buf log item type indicator */ + unsigned short blf_size; /* size of this item */ + ushort blf_flags; /* misc state */ + ushort blf_len; /* number of blocks in this buf */ + __int64_t blf_blkno; /* starting blkno of this buf */ + unsigned int blf_map_size; /* used size of data bitmap in words */ + unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ +} xfs_buf_log_format_t; + +/* + * All buffers now need to tell recovery where the magic number + * is so that it can verify and calculate the CRCs on the buffer correctly + * once the changes have been replayed into the buffer. + * + * The type value is held in the upper 5 bits of the blf_flags field, which is + * an unsigned 16 bit field. Hence we need to shift it 11 bits up and down. + */ +#define XFS_BLFT_BITS 5 +#define XFS_BLFT_SHIFT 11 +#define XFS_BLFT_MASK (((1 << XFS_BLFT_BITS) - 1) << XFS_BLFT_SHIFT) + +enum xfs_blft { + XFS_BLFT_UNKNOWN_BUF = 0, + XFS_BLFT_UDQUOT_BUF, + XFS_BLFT_PDQUOT_BUF, + XFS_BLFT_GDQUOT_BUF, + XFS_BLFT_BTREE_BUF, + XFS_BLFT_AGF_BUF, + XFS_BLFT_AGFL_BUF, + XFS_BLFT_AGI_BUF, + XFS_BLFT_DINO_BUF, + XFS_BLFT_SYMLINK_BUF, + XFS_BLFT_DIR_BLOCK_BUF, + XFS_BLFT_DIR_DATA_BUF, + XFS_BLFT_DIR_FREE_BUF, + XFS_BLFT_DIR_LEAF1_BUF, + XFS_BLFT_DIR_LEAFN_BUF, + XFS_BLFT_DA_NODE_BUF, + XFS_BLFT_ATTR_LEAF_BUF, + XFS_BLFT_ATTR_RMT_BUF, + XFS_BLFT_SB_BUF, + XFS_BLFT_MAX_BUF = (1 << XFS_BLFT_BITS), +}; + +static inline void +xfs_blft_to_flags(struct xfs_buf_log_format *blf, enum xfs_blft type) +{ + ASSERT(type > XFS_BLFT_UNKNOWN_BUF && type < XFS_BLFT_MAX_BUF); + blf->blf_flags &= ~XFS_BLFT_MASK; + blf->blf_flags |= ((type << XFS_BLFT_SHIFT) & XFS_BLFT_MASK); +} + +static inline __uint16_t +xfs_blft_from_flags(struct xfs_buf_log_format *blf) +{ + return (blf->blf_flags & XFS_BLFT_MASK) >> XFS_BLFT_SHIFT; +} + +/* + * EFI/EFD log format definitions + */ +typedef struct xfs_extent { + xfs_dfsbno_t ext_start; + xfs_extlen_t ext_len; +} xfs_extent_t; + +/* + * Since an xfs_extent_t has types (start:64, len: 32) + * there are different alignments on 32 bit and 64 bit kernels. + * So we provide the different variants for use by a + * conversion routine. + */ +typedef struct xfs_extent_32 { + __uint64_t ext_start; + __uint32_t ext_len; +} __attribute__((packed)) xfs_extent_32_t; + +typedef struct xfs_extent_64 { + __uint64_t ext_start; + __uint32_t ext_len; + __uint32_t ext_pad; +} xfs_extent_64_t; + +/* + * This is the structure used to lay out an efi log item in the + * log. The efi_extents field is a variable size array whose + * size is given by efi_nextents. + */ +typedef struct xfs_efi_log_format { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_t; + +typedef struct xfs_efi_log_format_32 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_32_t efi_extents[1]; /* array of extents to free */ +} __attribute__((packed)) xfs_efi_log_format_32_t; + +typedef struct xfs_efi_log_format_64 { + __uint16_t efi_type; /* efi log item type */ + __uint16_t efi_size; /* size of this item */ + __uint32_t efi_nextents; /* # extents to free */ + __uint64_t efi_id; /* efi identifier */ + xfs_extent_64_t efi_extents[1]; /* array of extents to free */ +} xfs_efi_log_format_64_t; + +/* + * This is the structure used to lay out an efd log item in the + * log. The efd_extents array is a variable size array whose + * size is given by efd_nextents; + */ +typedef struct xfs_efd_log_format { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_t; + +typedef struct xfs_efd_log_format_32 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_32_t efd_extents[1]; /* array of extents freed */ +} __attribute__((packed)) xfs_efd_log_format_32_t; + +typedef struct xfs_efd_log_format_64 { + __uint16_t efd_type; /* efd log item type */ + __uint16_t efd_size; /* size of this item */ + __uint32_t efd_nextents; /* # of extents freed */ + __uint64_t efd_efi_id; /* id of corresponding efi */ + xfs_extent_64_t efd_extents[1]; /* array of extents freed */ +} xfs_efd_log_format_64_t; + +/* + * Dquot Log format definitions. + * + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + */ +typedef struct xfs_dq_logformat { + __uint16_t qlf_type; /* dquot log item type */ + __uint16_t qlf_size; /* size of this item */ + xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ + __int64_t qlf_blkno; /* blkno of dquot buffer */ + __int32_t qlf_len; /* len of dquot buffer */ + __uint32_t qlf_boffset; /* off of dquot in buffer */ +} xfs_dq_logformat_t; + +/* + * log format struct for QUOTAOFF records. + * The first two fields must be the type and size fitting into + * 32 bits : log_recovery code assumes that. + * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer + * to the first and ensures that the first logitem is taken out of the AIL + * only when the last one is securely committed. + */ +typedef struct xfs_qoff_logformat { + unsigned short qf_type; /* quotaoff log item type */ + unsigned short qf_size; /* size of this item */ + unsigned int qf_flags; /* USR and/or GRP */ + char qf_pad[12]; /* padding for future */ +} xfs_qoff_logformat_t; + +/* + * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. + */ +#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ +#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ +#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ +#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ +#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ +#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ +#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ + +/* + * Conversion to and from the combined OQUOTA flag (if necessary) + * is done only in xfs_sb_qflags_to_disk() and xfs_sb_qflags_from_disk() + */ +#define XFS_GQUOTA_ENFD 0x0080 /* group quota limits enforced */ +#define XFS_GQUOTA_CHKD 0x0100 /* quotacheck run on group quotas */ +#define XFS_PQUOTA_ENFD 0x0200 /* project quota limits enforced */ +#define XFS_PQUOTA_CHKD 0x0400 /* quotacheck run on project quotas */ + +#define XFS_ALL_QUOTA_ACCT \ + (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) +#define XFS_ALL_QUOTA_ENFD \ + (XFS_UQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_ENFD) +#define XFS_ALL_QUOTA_CHKD \ + (XFS_UQUOTA_CHKD | XFS_GQUOTA_CHKD | XFS_PQUOTA_CHKD) + +#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ + XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ + XFS_GQUOTA_ENFD|XFS_GQUOTA_CHKD|\ + XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD|\ + XFS_PQUOTA_CHKD) + +/* + * Inode create log item structure + * + * Log recovery assumes the first two entries are the type and size and they fit + * in 32 bits. Also in host order (ugh) so they have to be 32 bit aligned so + * decoding can be done correctly. + */ +struct xfs_icreate_log { + __uint16_t icl_type; /* type of log format structure */ + __uint16_t icl_size; /* size of log format structure */ + __be32 icl_ag; /* ag being allocated in */ + __be32 icl_agbno; /* start block of inode range */ + __be32 icl_count; /* number of inodes to initialise */ + __be32 icl_isize; /* size of inodes */ + __be32 icl_length; /* length of extent to initialise */ + __be32 icl_gen; /* inode generation number to use */ +}; + +#endif /* __XFS_LOG_FORMAT_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 34bcbf50789..9bc403a9e54 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -19,72 +19,19 @@ #define __XFS_LOG_PRIV_H__ struct xfs_buf; -struct ktrace; -struct log; +struct xlog; struct xlog_ticket; -struct xfs_buf_cancel; struct xfs_mount; +struct xfs_log_callback; /* - * Macros, structures, prototypes for internal log manager use. - */ - -#define XLOG_MIN_ICLOGS 2 -#define XLOG_MED_ICLOGS 4 -#define XLOG_MAX_ICLOGS 8 -#define XLOG_CALLBACK_SIZE 10 -#define XLOG_HEADER_MAGIC_NUM 0xFEEDbabe /* Invalid cycle number */ -#define XLOG_VERSION_1 1 -#define XLOG_VERSION_2 2 /* Large IClogs, Log sunit */ -#define XLOG_VERSION_OKBITS (XLOG_VERSION_1 | XLOG_VERSION_2) -#define XLOG_RECORD_BSIZE (16*1024) /* eventually 32k */ -#define XLOG_BIG_RECORD_BSIZE (32*1024) /* 32k buffers */ -#define XLOG_MAX_RECORD_BSIZE (256*1024) -#define XLOG_HEADER_CYCLE_SIZE (32*1024) /* cycle data in header */ -#define XLOG_RECORD_BSHIFT 14 /* 16384 == 1 << 14 */ -#define XLOG_BIG_RECORD_BSHIFT 15 /* 32k == 1 << 15 */ -#define XLOG_MAX_RECORD_BSHIFT 18 /* 256k == 1 << 18 */ -#define XLOG_BTOLSUNIT(log, b) (((b)+(log)->l_mp->m_sb.sb_logsunit-1) / \ - (log)->l_mp->m_sb.sb_logsunit) -#define XLOG_LSUNITTOB(log, su) ((su) * (log)->l_mp->m_sb.sb_logsunit) - -#define XLOG_HEADER_SIZE 512 - -#define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \ - XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) -#define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? \ - XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) - -/* - * set lsns + * Flags for log structure */ - -#define ASSIGN_ANY_LSN_HOST(lsn,cycle,block) \ - { \ - (lsn) = ((xfs_lsn_t)(cycle)<<32)|(block); \ - } -#define ASSIGN_ANY_LSN_DISK(lsn,cycle,block) \ - { \ - INT_SET(((uint *)&(lsn))[0], ARCH_CONVERT, (cycle)); \ - INT_SET(((uint *)&(lsn))[1], ARCH_CONVERT, (block)); \ - } -#define ASSIGN_LSN(lsn,log) \ - ASSIGN_ANY_LSN_DISK(lsn,(log)->l_curr_cycle,(log)->l_curr_block); - -#define XLOG_SET(f,b) (((f) & (b)) == (b)) - -#define GET_CYCLE(ptr, arch) \ - (INT_GET(*(uint *)(ptr), arch) == XLOG_HEADER_MAGIC_NUM ? \ - INT_GET(*((uint *)(ptr)+1), arch) : \ - INT_GET(*(uint *)(ptr), arch) \ - ) - -#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) - - -#ifdef __KERNEL__ +#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ +#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ /* * get client id from packed copy. @@ -98,23 +45,10 @@ struct xfs_mount; * * this has endian issues, of course. */ - -#ifndef XFS_NATIVE_HOST -#define GET_CLIENT_ID(i,arch) \ - ((i) & 0xff) -#else -#define GET_CLIENT_ID(i,arch) \ - ((i) >> 24) -#endif - -#define GRANT_LOCK(log) mutex_spinlock(&(log)->l_grant_lock) -#define GRANT_UNLOCK(log, s) mutex_spinunlock(&(log)->l_grant_lock, s) -#define LOG_LOCK(log) mutex_spinlock(&(log)->l_icloglock) -#define LOG_UNLOCK(log, s) mutex_spinunlock(&(log)->l_icloglock, s) - -#define xlog_panic(args...) cmn_err(CE_PANIC, ## args) -#define xlog_exit(args...) cmn_err(CE_PANIC, ## args) -#define xlog_warn(args...) cmn_err(CE_WARN, ## args) +static inline uint xlog_get_client_id(__be32 i) +{ + return be32_to_cpu(i) >> 24; +} /* * In core log state @@ -130,52 +64,17 @@ struct xfs_mount; #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ -#endif /* __KERNEL__ */ /* - * Flags to log operation header - * - * The first write of a new transaction will be preceded with a start - * record, XLOG_START_TRANS. Once a transaction is committed, a commit - * record is written, XLOG_COMMIT_TRANS. If a single region can not fit into - * the remainder of the current active in-core log, it is split up into - * multiple regions. Each partial region will be marked with a - * XLOG_CONTINUE_TRANS until the last one, which gets marked with XLOG_END_TRANS. - * - */ -#define XLOG_START_TRANS 0x01 /* Start a new transaction */ -#define XLOG_COMMIT_TRANS 0x02 /* Commit this transaction */ -#define XLOG_CONTINUE_TRANS 0x04 /* Cont this trans into new region */ -#define XLOG_WAS_CONT_TRANS 0x08 /* Cont this trans into new region */ -#define XLOG_END_TRANS 0x10 /* End a continued transaction */ -#define XLOG_UNMOUNT_TRANS 0x20 /* Unmount a filesystem transaction */ -#define XLOG_SKIP_TRANS (XLOG_COMMIT_TRANS | XLOG_CONTINUE_TRANS | \ - XLOG_WAS_CONT_TRANS | XLOG_END_TRANS | \ - XLOG_UNMOUNT_TRANS) - -#ifdef __KERNEL__ -/* * Flags to log ticket */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ -#define XLOG_TIC_IN_Q 0x4 -#endif /* __KERNEL__ */ - -#define XLOG_UNMOUNT_TYPE 0x556e /* Un for Unmount */ - -/* - * Flags for log structure - */ -#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ -#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ -#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ -#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being - shutdown */ -typedef __uint32_t xlog_tid_t; +#define XLOG_TIC_FLAGS \ + { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ + { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } -#ifdef __KERNEL__ /* * Below are states for covering allocation transactions. * By covering, we mean changing the h_tail_lsn in the last on-disk @@ -251,25 +150,8 @@ typedef __uint32_t xlog_tid_t; #define XLOG_COVER_OPS 5 - /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 -#define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \ - (t)->t_res_arr_sum = (t)->t_res_num_ophdrs = 0) -#define XLOG_TIC_ADD_OPHDR(t) ((t)->t_res_num_ophdrs++) -#define XLOG_TIC_ADD_REGION(t, len, type) \ - do { \ - if ((t)->t_res_num == XLOG_TIC_LEN_MAX) { \ - /* add to overflow and start again */ \ - (t)->t_res_o_flow += (t)->t_res_arr_sum; \ - (t)->t_res_num = 0; \ - (t)->t_res_arr_sum = 0; \ - } \ - (t)->t_res_arr[(t)->t_res_num].r_len = (len); \ - (t)->t_res_arr[(t)->t_res_num].r_type = (type); \ - (t)->t_res_arr_sum += (len); \ - (t)->t_res_num++; \ - } while (0) /* * Reservation region @@ -282,10 +164,10 @@ typedef struct xlog_res { } xlog_res_t; typedef struct xlog_ticket { - sv_t t_sema; /* sleep on this semaphore : 20 */ - struct xlog_ticket *t_next; /* :4|8 */ - struct xlog_ticket *t_prev; /* :4|8 */ + struct list_head t_queue; /* reserve/write queue */ + struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier : 4 */ + atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ int t_unit_res; /* unit reservation in bytes : 4 */ char t_ocnt; /* original count : 1 */ @@ -302,60 +184,12 @@ typedef struct xlog_ticket { xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; -#endif - - -typedef struct xlog_op_header { - xlog_tid_t oh_tid; /* transaction id of operation : 4 b */ - int oh_len; /* bytes in data region : 4 b */ - __uint8_t oh_clientid; /* who sent me this : 1 b */ - __uint8_t oh_flags; /* : 1 b */ - ushort oh_res2; /* 32 bit align : 2 b */ -} xlog_op_header_t; - - -/* valid values for h_fmt */ -#define XLOG_FMT_UNKNOWN 0 -#define XLOG_FMT_LINUX_LE 1 -#define XLOG_FMT_LINUX_BE 2 -#define XLOG_FMT_IRIX_BE 3 - -/* our fmt */ -#ifdef XFS_NATIVE_HOST -#define XLOG_FMT XLOG_FMT_LINUX_BE -#else -#define XLOG_FMT XLOG_FMT_LINUX_LE -#endif - -typedef struct xlog_rec_header { - uint h_magicno; /* log record (LR) identifier : 4 */ - uint h_cycle; /* write cycle of log : 4 */ - int h_version; /* LR version : 4 */ - int h_len; /* len in bytes; should be 64-bit aligned: 4 */ - xfs_lsn_t h_lsn; /* lsn of this LR : 8 */ - xfs_lsn_t h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ - uint h_chksum; /* may not be used; non-zero if used : 4 */ - int h_prev_block; /* block number to previous LR : 4 */ - int h_num_logops; /* number of log operations in this LR : 4 */ - uint h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; - /* new fields */ - int h_fmt; /* format of log record : 4 */ - uuid_t h_fs_uuid; /* uuid of FS : 16 */ - int h_size; /* iclog size : 4 */ -} xlog_rec_header_t; - -typedef struct xlog_rec_ext_header { - uint xh_cycle; /* write cycle of log : 4 */ - uint xh_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; /* : 256 */ -} xlog_rec_ext_header_t; - -#ifdef __KERNEL__ /* * - A log record header is 512 bytes. There is plenty of room to grow the * xlog_rec_header_t into the reserved space. * - ic_data follows, so a write to disk can start at the beginning of * the iclog. - * - ic_forcesema is used to implement synchronous forcing of the iclog to disk. + * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_bp is a pointer to the buffer used to write this incore log to disk. * - ic_log is a pointer back to the global log structure. @@ -365,57 +199,156 @@ typedef struct xlog_rec_ext_header { * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. + * + * Because of cacheline contention on large machines, we need to separate + * various resources onto different cachelines. To start with, make the + * structure cacheline aligned. The following fields can be contended on + * by independent processes: + * + * - ic_callback_* + * - ic_refcnt + * - fields protected by the global l_icloglock + * + * so we need to ensure that these fields are located in separate cachelines. + * We'll put all the read-only and l_icloglock fields in the first cacheline, + * and move everything else out to subsequent cachelines. */ -typedef struct xlog_iclog_fields { - sv_t ic_forcesema; - sv_t ic_writesema; +typedef struct xlog_in_core { + wait_queue_head_t ic_force_wait; + wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; - struct log *ic_log; - xfs_log_callback_t *ic_callback; - xfs_log_callback_t **ic_callback_tail; -#ifdef XFS_LOG_TRACE - struct ktrace *ic_trace; -#endif + struct xlog *ic_log; int ic_size; int ic_offset; - int ic_refcnt; int ic_bwritecnt; - ushort_t ic_state; + unsigned short ic_state; char *ic_datap; /* pointer to iclog data */ -} xlog_iclog_fields_t; -typedef union xlog_in_core2 { - xlog_rec_header_t hic_header; - xlog_rec_ext_header_t hic_xheader; - char hic_sector[XLOG_HEADER_SIZE]; -} xlog_in_core_2_t; + /* Callback structures need their own cacheline */ + spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; + struct xfs_log_callback *ic_callback; + struct xfs_log_callback **ic_callback_tail; -typedef struct xlog_in_core { - xlog_iclog_fields_t hic_fields; - xlog_in_core_2_t *hic_data; + /* reference counts need their own cacheline */ + atomic_t ic_refcnt ____cacheline_aligned_in_smp; + xlog_in_core_2_t *ic_data; +#define ic_header ic_data->hic_header } xlog_in_core_t; /* - * Defines to save our code from this glop. + * The CIL context is used to aggregate per-transaction details as well be + * passed to the iclog for checkpoint post-commit processing. After being + * passed to the iclog, another context needs to be allocated for tracking the + * next set of transactions to be aggregated into a checkpoint. + */ +struct xfs_cil; + +struct xfs_cil_ctx { + struct xfs_cil *cil; + xfs_lsn_t sequence; /* chkpt sequence # */ + xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ + xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_ticket *ticket; /* chkpt ticket */ + int nvecs; /* number of regions */ + int space_used; /* aggregate size of regions */ + struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_log_vec *lv_chain; /* logvecs being pushed */ + struct xfs_log_callback log_cb; /* completion callback hook. */ + struct list_head committing; /* ctx committing list */ +}; + +/* + * Committed Item List structure + * + * This structure is used to track log items that have been committed but not + * yet written into the log. It is used only when the delayed logging mount + * option is enabled. + * + * This structure tracks the list of committing checkpoint contexts so + * we can avoid the problem of having to hold out new transactions during a + * flush until we have a the commit record LSN of the checkpoint. We can + * traverse the list of committing contexts in xlog_cil_push_lsn() to find a + * sequence match and extract the commit LSN directly from there. If the + * checkpoint is still in the process of committing, we can block waiting for + * the commit LSN to be determined as well. This should make synchronous + * operations almost as efficient as the old logging methods. + */ +struct xfs_cil { + struct xlog *xc_log; + struct list_head xc_cil; + spinlock_t xc_cil_lock; + + struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; + struct xfs_cil_ctx *xc_ctx; + + spinlock_t xc_push_lock ____cacheline_aligned_in_smp; + xfs_lsn_t xc_push_seq; + struct list_head xc_committing; + wait_queue_head_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; + struct work_struct xc_push_work; +} ____cacheline_aligned_in_smp; + +/* + * The amount of log space we allow the CIL to aggregate is difficult to size. + * Whatever we choose, we have to make sure we can get a reservation for the + * log space effectively, that it is large enough to capture sufficient + * relogging to reduce log buffer IO significantly, but it is not too large for + * the log or induces too much latency when writing out through the iclogs. We + * track both space consumed and the number of vectors in the checkpoint + * context, so we need to decide which to use for limiting. + * + * Every log buffer we write out during a push needs a header reserved, which + * is at least one sector and more for v2 logs. Hence we need a reservation of + * at least 512 bytes per 32k of log space just for the LR headers. That means + * 16KB of reservation per megabyte of delayed logging space we will consume, + * plus various headers. The number of headers will vary based on the num of + * io vectors, so limiting on a specific number of vectors is going to result + * in transactions of varying size. IOWs, it is more consistent to track and + * limit space consumed in the log rather than by the number of objects being + * logged in order to prevent checkpoint ticket overruns. + * + * Further, use of static reservations through the log grant mechanism is + * problematic. It introduces a lot of complexity (e.g. reserve grant vs write + * grant) and a significant deadlock potential because regranting write space + * can block on log pushes. Hence if we have to regrant log space during a log + * push, we can deadlock. + * + * However, we can avoid this by use of a dynamic "reservation stealing" + * technique during transaction commit whereby unused reservation space in the + * transaction ticket is transferred to the CIL ctx commit ticket to cover the + * space needed by the checkpoint transaction. This means that we never need to + * specifically reserve space for the CIL checkpoint transaction, nor do we + * need to regrant space once the checkpoint completes. This also means the + * checkpoint transaction ticket is specific to the checkpoint context, rather + * than the CIL itself. + * + * With dynamic reservations, we can effectively make up arbitrary limits for + * the checkpoint size so long as they don't violate any other size rules. + * Recovery imposes a rule that no transaction exceed half the log, so we are + * limited by that. Furthermore, the log transaction reservation subsystem + * tries to keep 25% of the log free, so we need to keep below that limit or we + * risk running out of free log space to start any new transactions. + * + * In order to keep background CIL push efficient, we will set a lower + * threshold at which background pushing is attempted without blocking current + * transaction commits. A separate, higher bound defines when CIL pushes are + * enforced to ensure we stay within our maximum checkpoint size bounds. + * threshold, yet give us plenty of space for aggregation on large logs. + */ +#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) + +/* + * ticket grant locks, queues and accounting have their own cachlines + * as these are quite hot and can be operated on concurrently. */ -#define ic_forcesema hic_fields.ic_forcesema -#define ic_writesema hic_fields.ic_writesema -#define ic_next hic_fields.ic_next -#define ic_prev hic_fields.ic_prev -#define ic_bp hic_fields.ic_bp -#define ic_log hic_fields.ic_log -#define ic_callback hic_fields.ic_callback -#define ic_callback_tail hic_fields.ic_callback_tail -#define ic_trace hic_fields.ic_trace -#define ic_size hic_fields.ic_size -#define ic_offset hic_fields.ic_offset -#define ic_refcnt hic_fields.ic_refcnt -#define ic_bwritecnt hic_fields.ic_bwritecnt -#define ic_state hic_fields.ic_state -#define ic_datap hic_fields.ic_datap -#define ic_header hic_data->hic_header +struct xlog_grant_head { + spinlock_t lock ____cacheline_aligned_in_smp; + struct list_head waiters; + atomic64_t grant; +}; /* * The reservation head lsn is not made up of a cycle number and block number. @@ -423,89 +356,204 @@ typedef struct xlog_in_core { * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ -typedef struct log { - /* The following block of fields are changed while holding icloglock */ - sema_t l_flushsema; /* iclog flushing semaphore */ - int l_flushcnt; /* # of procs waiting on this - * sema */ - int l_ticket_cnt; /* free ticket count */ - int l_ticket_tcnt; /* total ticket count */ - int l_covered_state;/* state of "covering disk - * log entries" */ - xlog_ticket_t *l_freelist; /* free list of tickets */ - xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */ - xlog_ticket_t *l_tail; /* free list of tickets */ - xlog_in_core_t *l_iclog; /* head log queue */ - lock_t l_icloglock; /* grab to change iclog state */ - xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed - * buffers */ - xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ +struct xlog { + /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ + struct xfs_ail *l_ailp; /* AIL log is working with */ + struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ + struct delayed_work l_work; /* background flush work */ + uint l_flags; + uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ + struct list_head *l_buf_cancel_table; + int l_iclog_hsize; /* size of iclog header */ + int l_iclog_heads; /* # of iclog header sectors */ + uint l_sectBBsize; /* sector size in BBs (2^n) */ + int l_iclog_size; /* size of log in bytes */ + int l_iclog_size_log; /* log power size of log */ + int l_iclog_bufs; /* number of iclog buffers */ xfs_daddr_t l_logBBstart; /* start block of log */ int l_logsize; /* size of log in bytes */ int l_logBBsize; /* size of log in BB chunks */ + + /* The following block of fields are changed while holding icloglock */ + wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; + /* waiting for iclog flush */ + int l_covered_state;/* state of "covering disk + * log entries" */ + xlog_in_core_t *l_iclog; /* head log queue */ + spinlock_t l_icloglock; /* grab to change iclog state */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ - int l_iclog_size; /* size of log in bytes */ - int l_iclog_size_log; /* log power size of log */ - int l_iclog_bufs; /* number of iclog buffers */ + + /* + * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and + * read without needing to hold specific locks. To avoid operations + * contending with other hot objects, place each of them on a separate + * cacheline. + */ + /* lsn of last LR on disk */ + atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; + /* lsn of 1st LR with unflushed * buffers */ + atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; + + struct xlog_grant_head l_reserve_head; + struct xlog_grant_head l_write_head; /* The following field are used for debugging; need to hold icloglock */ +#ifdef DEBUG char *l_iclog_bak[XLOG_MAX_ICLOGS]; +#endif - /* The following block of fields are changed while holding grant_lock */ - lock_t l_grant_lock; - xlog_ticket_t *l_reserve_headq; - xlog_ticket_t *l_write_headq; - int l_grant_reserve_cycle; - int l_grant_reserve_bytes; - int l_grant_write_cycle; - int l_grant_write_bytes; +}; - /* The following fields don't need locking */ -#ifdef XFS_LOG_TRACE - struct ktrace *l_trace; - struct ktrace *l_grant_trace; -#endif - uint l_flags; - uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ - struct xfs_buf_cancel **l_buf_cancel_table; - int l_iclog_hsize; /* size of iclog header */ - int l_iclog_heads; /* # of iclog header sectors */ - uint l_sectbb_log; /* log2 of sector size in BBs */ - uint l_sectbb_mask; /* sector size (in BBs) - * alignment mask */ -} xlog_t; +#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ + ((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE)) #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) - /* common routines */ -extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); -extern int xlog_find_tail(xlog_t *log, - xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk); -extern int xlog_recover(xlog_t *log); -extern int xlog_recover_finish(xlog_t *log, int mfsi_flags); -extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); -extern void xlog_recover_process_iunlinks(xlog_t *log); - -extern struct xfs_buf *xlog_get_bp(xlog_t *, int); -extern void xlog_put_bp(struct xfs_buf *); -extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); - -/* iclog tracing */ -#define XLOG_TRACE_GRAB_FLUSH 1 -#define XLOG_TRACE_REL_FLUSH 2 -#define XLOG_TRACE_SLEEP_FLUSH 3 -#define XLOG_TRACE_WAKE_FLUSH 4 - -#endif /* __KERNEL__ */ +extern int +xlog_recover( + struct xlog *log); +extern int +xlog_recover_finish( + struct xlog *log); + +extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, + char *dp, int size); + +extern kmem_zone_t *xfs_log_ticket_zone; +struct xlog_ticket * +xlog_ticket_alloc( + struct xlog *log, + int unit_bytes, + int count, + char client, + bool permanent, + xfs_km_flags_t alloc_flags); + + +static inline void +xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) +{ + *ptr += bytes; + *len -= bytes; + *off += bytes; +} + +void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); +int +xlog_write( + struct xlog *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *tic, + xfs_lsn_t *start_lsn, + struct xlog_in_core **commit_iclog, + uint flags); + +/* + * When we crack an atomic LSN, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. This should always + * be used to sample and crack LSNs that are stored and updated in atomic + * variables. + */ +static inline void +xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) +{ + xfs_lsn_t val = atomic64_read(lsn); + + *cycle = CYCLE_LSN(val); + *block = BLOCK_LSN(val); +} + +/* + * Calculate and assign a value to an atomic LSN variable from component pieces. + */ +static inline void +xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) +{ + atomic64_set(lsn, xlog_assign_lsn(cycle, block)); +} + +/* + * When we crack the grant head, we sample it first so that the value will not + * change while we are cracking it into the component values. This means we + * will always get consistent component values to work from. + */ +static inline void +xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) +{ + *cycle = val >> 32; + *space = val & 0xffffffff; +} + +static inline void +xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) +{ + xlog_crack_grant_head_val(atomic64_read(head), cycle, space); +} + +static inline int64_t +xlog_assign_grant_head_val(int cycle, int space) +{ + return ((int64_t)cycle << 32) | space; +} + +static inline void +xlog_assign_grant_head(atomic64_t *head, int cycle, int space) +{ + atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); +} + +/* + * Committed Item List interfaces + */ +int xlog_cil_init(struct xlog *log); +void xlog_cil_init_post_recovery(struct xlog *log); +void xlog_cil_destroy(struct xlog *log); +bool xlog_cil_empty(struct xlog *log); + +/* + * CIL force routines + */ +xfs_lsn_t +xlog_cil_force_lsn( + struct xlog *log, + xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct xlog *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} + +/* + * Unmount record type is used as a pseudo transaction type for the ticket. + * It's value must be outside the range of XFS_TRANS_* values. + */ +#define XLOG_UNMOUNT_REC_TYPE (-1U) + +/* + * Wrapper function for waiting on a wait queue serialised against wakeups + * by a spinlock. This matches the semantics of all the wait queues used in the + * log code. + */ +static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(wq, &wait); + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(lock); + schedule(); + remove_wait_queue(wq, &wait); +} #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 3cb678e3a13..981af0f6504 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,115 +17,239 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_imap.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" #include "xfs_log_recover.h" +#include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" #include "xfs_quota.h" -#include "xfs_rw.h" +#include "xfs_cksum.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_dir2.h" -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); -STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, - xlog_recover_item_t *item); +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); -STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) -#define xlog_recover_check_ail(mp, lip, gen) #endif +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; /* * Sector aligned buffer routines for buffer create/read/write/access */ -#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ - ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ - ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) -#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ + +static inline int +xlog_buf_bbcount_valid( + struct xlog *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} -xfs_buf_t * +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. + */ +STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, - int num_bblks) + struct xlog *log, + int nbblks) { - ASSERT(num_bblks > 0); + struct xfs_buf *bp; - if (log->l_sectbb_log) { - if (num_bblks > 1) - num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return NULL; } - return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); + + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to accommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accommodate this possibility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); + if (bp) + xfs_buf_unlock(bp); + return bp; } -void +STATIC void xlog_put_bp( xfs_buf_t *bp) { xfs_buf_free(bp); } +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. + */ +STATIC xfs_caddr_t +xlog_align( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp) +{ + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); + + ASSERT(offset + nbblks <= bp->b_length); + return bp->b_addr + BBTOB(offset); +} + /* * nbblks should be uint, but oh well. Just want to catch that 32-bit length. */ -int -xlog_bread( - xlog_t *log, +STATIC int +xlog_bread_noalign( + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); - ASSERT(bp); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); - - xfsbdstrat(log->l_mp, bp); - if ((error = xfs_iowait(bp))) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + bp->b_io_length = nbblks; + bp->b_error = 0; + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); return error; } +STATIC int +xlog_bread( + struct xlog *log, + xfs_daddr_t blk_no, + int nbblks, + struct xfs_buf *bp, + xfs_caddr_t *offset) +{ + int error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + if (error) + return error; + + *offset = xlog_align(log, blk_no, nbblks, bp); + return 0; +} + +/* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + struct xlog *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + struct xfs_buf *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = bp->b_addr; + int orig_len = BBTOB(bp->b_length); + int error, error2; + + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + /* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. @@ -133,51 +257,38 @@ xlog_bread( */ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); + return EFSCORRUPTED; } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); + ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_HOLD(bp); - XFS_BUF_PSEMA(bp, PRIBIO); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); - - if ((error = xfs_bwrite(log->l_mp, bp))) - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); - return error; -} + xfs_buf_hold(bp); + xfs_buf_lock(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; -STATIC xfs_caddr_t -xlog_align( - xlog_t *log, - xfs_daddr_t blk_no, - int nbblks, - xfs_buf_t *bp) -{ - xfs_caddr_t ptr; - - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); - - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); - return ptr; + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + return error; } #ifdef DEBUG @@ -189,16 +300,10 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - int b; - - cmn_err(CE_DEBUG, "%s: SB : uuid = ", __FUNCTION__); - for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); - cmn_err(CE_DEBUG, " log : uuid = "); - for (b = 0; b < 16; b++) - cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); - cmn_err(CE_DEBUG, ", fmt = %d\n", INT_GET(head->h_fmt, ARCH_CONVERT)); + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", + __func__, &mp->m_sb.sb_uuid, XLOG_FMT); + xfs_debug(mp, " log : uuid = %pU, fmt = %d", + &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else #define xlog_header_check_dump(mp, head) @@ -212,23 +317,23 @@ xlog_header_check_recover( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); /* * IRIX doesn't write the h_fmt field and leaves it zeroed * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. */ - if (unlikely(INT_GET(head->h_fmt, ARCH_CONVERT) != XLOG_FMT)) { - xlog_warn( - "XFS: dirty log written in incompatible format - can't recover"); + if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(1)", XFS_ERRLEVEL_HIGH, mp); return XFS_ERROR(EFSCORRUPTED); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { - xlog_warn( - "XFS: dirty log entry has mismatched uuid - can't recover"); + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(2)", XFS_ERRLEVEL_HIGH, mp); @@ -245,7 +350,7 @@ xlog_header_check_mount( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(INT_GET(head->h_magicno, ARCH_CONVERT) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); if (uuid_is_nil(&head->h_fs_uuid)) { /* @@ -253,9 +358,9 @@ xlog_header_check_mount( * h_fs_uuid is nil, we assume this log was last mounted * by IRIX and continue. */ - xlog_warn("XFS: nil uuid in log - IRIX style log"); + xfs_warn(mp, "nil uuid in log - IRIX style log"); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { - xlog_warn("XFS: log has mismatched uuid - can't recover"); + xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_mount", XFS_ERRLEVEL_HIGH, mp); @@ -268,23 +373,17 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - xfs_mount_t *mp; - - ASSERT(XFS_BUF_FSPRIVATE(bp, void *)); - - if (XFS_BUF_GETERROR(bp)) { + if (bp->b_error) { /* * We're not going to bother about retrying * this during recovery. One strike! */ - mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *); - xfs_ioerror_alert("xlog_recover_iodone", - mp, bp, XFS_BUF_ADDR(bp)); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); } - XFS_BUF_SET_FSPRIVATE(bp, NULL); - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_biodone(bp); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); } /* @@ -293,53 +392,52 @@ xlog_recover_iodone( * Note that the algorithm can not be perfect because the disk will not * necessarily be perfect. */ -int +STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) { xfs_caddr_t offset; xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; uint mid_cycle; int error; - mid_blk = BLK_AVG(first_blk, *last_blk); - while (mid_blk != first_blk && mid_blk != *last_blk) { - if ((error = xlog_bread(log, mid_blk, 1, bp))) + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { + error = xlog_bread(log, mid_blk, 1, bp, &offset); + if (error) return error; - offset = xlog_align(log, mid_blk, 1, bp); - mid_cycle = GET_CYCLE(offset, ARCH_CONVERT); - if (mid_cycle == cycle) { - *last_blk = mid_blk; - /* last_half_cycle == mid_cycle */ - } else { - first_blk = mid_blk; - /* first_half_cycle == mid_cycle */ - } - mid_blk = BLK_AVG(first_blk, *last_blk); + mid_cycle = xlog_get_cycle(offset); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); } - ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || - (mid_blk == *last_blk && mid_blk-1 == first_blk)); + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; return 0; } /* - * Check that the range of blocks does not contain the cycle number - * given. The scan needs to occur from front to back and the ptr into the - * region must be updated since a later routine will need to perform another - * test. If the region is completely good, we end up returning the same - * last block number. - * - * Set blkno to -1 if we encounter no errors. This is an invalid block number - * since we don't ever expect logs to get this large. + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -352,12 +450,18 @@ xlog_find_verify_cycle( xfs_caddr_t buf = NULL; int error = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. We need to be able to read at least + * a log sector, or we're out of luck. + */ bufblks = 1 << ffs(nbblks); - + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { - /* can't get enough memory to do everything in one big buffer */ bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < log->l_sectBBsize) return ENOMEM; } @@ -366,12 +470,12 @@ xlog_find_verify_cycle( bcount = min(bufblks, (start_blk + nbblks - i)); - if ((error = xlog_bread(log, i, bcount, bp))) + error = xlog_bread(log, i, bcount, bp, &buf); + if (error) goto out; - buf = xlog_align(log, i, bcount, bp); for (j = 0; j < bcount; j++) { - cycle = GET_CYCLE(buf, ARCH_CONVERT); + cycle = xlog_get_cycle(buf); if (cycle == stop_on_cycle_no) { *new_blk = i+j; goto out; @@ -402,7 +506,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -423,32 +527,31 @@ xlog_find_verify_log_record( return ENOMEM; smallmem = 1; } else { - if ((error = xlog_bread(log, start_blk, num_blks, bp))) + error = xlog_bread(log, start_blk, num_blks, bp, &offset); + if (error) goto out; - offset = xlog_align(log, start_blk, num_blks, bp); offset += ((num_blks - 1) << BBSHIFT); } for (i = (*last_blk) - 1; i >= 0; i--) { if (i < start_blk) { /* valid log record not found */ - xlog_warn( - "XFS: Log inconsistent (didn't find previous header)"); + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); ASSERT(0); error = XFS_ERROR(EIO); goto out; } if (smallmem) { - if ((error = xlog_bread(log, i, 1, bp))) + error = xlog_bread(log, i, 1, bp, &offset); + if (error) goto out; - offset = xlog_align(log, i, 1, bp); } head = (xlog_rec_header_t *)offset; - if (XLOG_HEADER_MAGIC_NUM == - INT_GET(head->h_magicno, ARCH_CONVERT)) + if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; if (!smallmem) @@ -479,8 +582,8 @@ xlog_find_verify_log_record( * reset last_blk. Only when last_blk points in the middle of a log * record do we update last_blk. */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - uint h_size = INT_GET(head->h_size, ARCH_CONVERT); + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + uint h_size = be32_to_cpu(head->h_size); xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; if (h_size % XLOG_HEADER_CYCLE_SIZE) @@ -489,8 +592,8 @@ xlog_find_verify_log_record( xhdrs = 1; } - if (*last_blk - i + extra_bblks - != BTOBB(INT_GET(head->h_len, ARCH_CONVERT)) + xhdrs) + if (*last_blk - i + extra_bblks != + BTOBB(be32_to_cpu(head->h_len)) + xhdrs) *last_blk = i; out: @@ -500,7 +603,7 @@ out: /* * Head is defined to be the point of the log where the next log write - * write could go. This means that incomplete LR writes at the end are + * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing @@ -513,7 +616,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -534,12 +637,12 @@ xlog_find_head( * mkfs etc write a dummy unmount record to a fresh * log so we can store the uuid in there */ - xlog_warn("XFS: totally zeroed log"); + xfs_warn(log->l_mp, "totally zeroed log"); } return 0; } else if (error) { - xlog_warn("XFS: empty log check failed"); + xfs_warn(log->l_mp, "empty log check failed"); return error; } @@ -547,16 +650,19 @@ xlog_find_head( bp = xlog_get_bp(log, 1); if (!bp) return ENOMEM; - if ((error = xlog_bread(log, 0, 1, bp))) + + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, 0, 1, bp); - first_half_cycle = GET_CYCLE(offset, ARCH_CONVERT); + + first_half_cycle = xlog_get_cycle(offset); last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ - if ((error = xlog_bread(log, last_blk, 1, bp))) + error = xlog_bread(log, last_blk, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, last_blk, 1, bp); - last_half_cycle = GET_CYCLE(offset, ARCH_CONVERT); + + last_half_cycle = xlog_get_cycle(offset); ASSERT(last_half_cycle != 0); /* @@ -603,7 +709,7 @@ xlog_find_head( * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on - * x + 1 ... | x ... + * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary @@ -613,11 +719,13 @@ xlog_find_head( * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like - * x + 1 ... | x | x + 1 | x ... - * ^ binary search stopped here + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot * or - * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; if ((error = xlog_find_cycle_start(log, bp, first_blk, @@ -673,16 +781,16 @@ xlog_find_head( * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. */ - start_blk = log_bbnum - num_scan_bblks + head_blk; ASSERT(head_blk <= INT_MAX && - (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto bp_err; if (new_blk != -1) { head_blk = new_blk; - goto bad_blk; + goto validate_head; } /* @@ -700,7 +808,7 @@ xlog_find_head( head_blk = new_blk; } - bad_blk: +validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. @@ -722,7 +830,7 @@ xlog_find_head( if ((error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0)) == -1) { /* We hit the beginning of the log during our search */ - start_blk = log_bbnum - num_scan_bblks + head_blk; + start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); @@ -757,7 +865,7 @@ xlog_find_head( xlog_put_bp(bp); if (error) - xlog_warn("XFS: failed to find log head"); + xfs_warn(log->l_mp, "failed to find log head"); return error; } @@ -777,9 +885,9 @@ xlog_find_head( * We could speed up search by using current head_blk buffer, but it is not * available. */ -int +STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -805,13 +913,14 @@ xlog_find_tail( if (!bp) return ENOMEM; if (*head_blk == 0) { /* special case */ - if ((error = xlog_bread(log, 0, 1, bp))) - goto bread_err; - offset = xlog_align(log, 0, 1, bp); - if (GET_CYCLE(offset, ARCH_CONVERT) == 0) { + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto done; + + if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ - goto exit; + goto done; } } @@ -820,11 +929,11 @@ xlog_find_tail( */ ASSERT(*head_blk < INT_MAX); for (i = (int)(*head_blk) - 1; i >= 0; i--) { - if ((error = xlog_bread(log, i, 1, bp))) - goto bread_err; - offset = xlog_align(log, i, 1, bp); - if (XLOG_HEADER_MAGIC_NUM == - INT_GET(*(uint *)offset, ARCH_CONVERT)) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 1; break; } @@ -837,25 +946,27 @@ xlog_find_tail( */ if (!found) { for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { - if ((error = xlog_bread(log, i, 1, bp))) - goto bread_err; - offset = xlog_align(log, i, 1, bp); - if (XLOG_HEADER_MAGIC_NUM == - INT_GET(*(uint*)offset, ARCH_CONVERT)) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto done; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 2; break; } } } if (!found) { - xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); ASSERT(0); return XFS_ERROR(EIO); } /* find blk_no of tail of log */ rhead = (xlog_rec_header_t *)offset; - *tail_blk = BLOCK_LSN(INT_GET(rhead->h_tail_lsn, ARCH_CONVERT)); + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); /* * Reset log values according to the state of the log when we @@ -869,15 +980,15 @@ xlog_find_tail( */ log->l_prev_block = i; log->l_curr_block = (int)*head_blk; - log->l_curr_cycle = INT_GET(rhead->h_cycle, ARCH_CONVERT); + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = INT_GET(rhead->h_tail_lsn, ARCH_CONVERT); - log->l_last_sync_lsn = INT_GET(rhead->h_lsn, ARCH_CONVERT); - log->l_grant_reserve_cycle = log->l_curr_cycle; - log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); - log->l_grant_write_cycle = log->l_curr_cycle; - log->l_grant_write_bytes = BBTOB(log->l_curr_block); + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); /* * Look for unmount record. If we find it, then we know there @@ -890,9 +1001,9 @@ xlog_find_tail( * unmount record if there is one, so we pass the lsn of the * unmount record rather than the block after it. */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - int h_size = INT_GET(rhead->h_size, ARCH_CONVERT); - int h_version = INT_GET(rhead->h_version, ARCH_CONVERT); + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rhead->h_size); + int h_version = be32_to_cpu(rhead->h_version); if ((h_version & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { @@ -906,15 +1017,15 @@ xlog_find_tail( hblks = 1; } after_umount_blk = (i + hblks + (int) - BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT))) % log->l_logBBsize; - tail_lsn = log->l_tail_lsn; + BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; + tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && - INT_GET(rhead->h_num_logops, ARCH_CONVERT) == 1) { + be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; - if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { - goto bread_err; - } - offset = xlog_align(log, umount_data_blk, 1, bp); + error = xlog_bread(log, umount_data_blk, 1, bp, &offset); + if (error) + goto done; + op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { /* @@ -922,11 +1033,19 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - ASSIGN_ANY_LSN_HOST(log->l_tail_lsn, log->l_curr_cycle, - after_umount_blk); - ASSIGN_ANY_LSN_HOST(log->l_last_sync_lsn, log->l_curr_cycle, - after_umount_blk); + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; + + /* + * Note that the unmount was clean. If the unmount + * was not clean, we need to know this to rebuild the + * superblock counters from the perag headers if we + * have a filesystem using non-persistent counters. + */ + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; } } @@ -949,16 +1068,14 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) error = xlog_clear_stale_blocks(log, tail_lsn); - } -bread_err: -exit: +done: xlog_put_bp(bp); if (error) - xlog_warn("XFS: failed to locate log tail"); + xfs_warn(log->l_mp, "failed to locate log tail"); return error; } @@ -978,9 +1095,9 @@ exit: * -1 => use *blk_no as the first block of the log * >0 => error has occurred */ -int +STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -996,10 +1113,11 @@ xlog_find_zeroed( bp = xlog_get_bp(log, 1); if (!bp) return ENOMEM; - if ((error = xlog_bread(log, 0, 1, bp))) + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, 0, 1, bp); - first_cycle = GET_CYCLE(offset, ARCH_CONVERT); + + first_cycle = xlog_get_cycle(offset); if (first_cycle == 0) { /* completely zeroed log */ *blk_no = 0; xlog_put_bp(bp); @@ -1007,10 +1125,11 @@ xlog_find_zeroed( } /* check partially zeroed log */ - if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) + error = xlog_bread(log, log_bbnum-1, 1, bp, &offset); + if (error) goto bp_err; - offset = xlog_align(log, log_bbnum-1, 1, bp); - last_cycle = GET_CYCLE(offset, ARCH_CONVERT); + + last_cycle = xlog_get_cycle(offset); if (last_cycle != 0) { /* log completely written to */ xlog_put_bp(bp); return 0; @@ -1020,8 +1139,10 @@ xlog_find_zeroed( * the first block must be 1. If it's not, maybe we're * not looking at a log... Bail out. */ - xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); - return XFS_ERROR(EINVAL); + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + error = XFS_ERROR(EINVAL); + goto bp_err; } /* we have a partially zeroed log */ @@ -1080,7 +1201,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1090,19 +1211,19 @@ xlog_add_record( xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; memset(buf, 0, BBSIZE); - INT_SET(recp->h_magicno, ARCH_CONVERT, XLOG_HEADER_MAGIC_NUM); - INT_SET(recp->h_cycle, ARCH_CONVERT, cycle); - INT_SET(recp->h_version, ARCH_CONVERT, - XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) ? 2 : 1); - ASSIGN_ANY_LSN_DISK(recp->h_lsn, cycle, block); - ASSIGN_ANY_LSN_DISK(recp->h_tail_lsn, tail_cycle, tail_block); - INT_SET(recp->h_fmt, ARCH_CONVERT, XLOG_FMT); + recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); + recp->h_cycle = cpu_to_be32(cycle); + recp->h_version = cpu_to_be32( + xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); + recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); + recp->h_fmt = cpu_to_be32(XLOG_FMT); memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); } STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1112,16 +1233,24 @@ xlog_write_log_records( xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; - int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < sectbb) return ENOMEM; } @@ -1129,12 +1258,12 @@ xlog_write_log_records( * the buffer in the starting sector not covered by the first * write below. */ - balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + balign = round_down(start_block, sectbb); if (balign != start_block) { - if ((error = xlog_bread(log, start_block, 1, bp))) { - xlog_put_bp(bp); - return error; - } + error = xlog_bread_noalign(log, start_block, 1, bp); + if (error) + goto out_put_bp; + j = start_block - balign; } @@ -1148,14 +1277,14 @@ xlog_write_log_records( * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. */ - ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp); - balign = BBTOB(ealign - start_block); - XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); - if ((error = xlog_bread(log, ealign, sectbb, bp))) + offset = bp->b_addr + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); + if (error) break; - XFS_BUF_SET_PTR(bp, offset, bufblks); + } offset = xlog_align(log, start_block, endcount, bp); @@ -1170,6 +1299,8 @@ xlog_write_log_records( start_block += endcount; j = 0; } + + out_put_bp: xlog_put_bp(bp); return error; } @@ -1192,7 +1323,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -1315,41 +1446,50 @@ xlog_clear_stale_blocks( STATIC xlog_recover_t * xlog_recover_find_tid( - xlog_recover_t *q, + struct hlist_head *head, xlog_tid_t tid) { - xlog_recover_t *p = q; + xlog_recover_t *trans; - while (p != NULL) { - if (p->r_log_tid == tid) - break; - p = p->r_next; + hlist_for_each_entry(trans, head, r_list) { + if (trans->r_log_tid == tid) + return trans; } - return p; + return NULL; } STATIC void -xlog_recover_put_hashq( - xlog_recover_t **q, - xlog_recover_t *trans) +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) { - trans->r_next = *q; - *q = trans; + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); } STATIC void xlog_recover_add_item( - xlog_recover_item_t **itemq) + struct list_head *head) { xlog_recover_item_t *item; item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); - xlog_recover_insert_item_backq(itemq, item); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); } STATIC int xlog_recover_add_to_cont_trans( - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1357,8 +1497,7 @@ xlog_recover_add_to_cont_trans( xfs_caddr_t ptr, old_ptr; int old_len; - item = trans->r_itemq; - if (item == 0) { + if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); ptr = (xfs_caddr_t) &trans->r_theader + @@ -1366,15 +1505,17 @@ xlog_recover_add_to_cont_trans( memcpy(ptr, dp, len); /* d, s, l */ return 0; } - item = item->ri_prev; + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -1393,7 +1534,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1403,9 +1545,14 @@ xlog_recover_add_to_trans( if (!len) return 0; - item = trans->r_itemq; - if (item == 0) { - ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC); + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return XFS_ERROR(EIO); + } if (len == sizeof(xfs_trans_header_t)) xlog_recover_add_item(&trans->r_itemq); memcpy(&trans->r_theader, dp, len); /* d, s, l */ @@ -1416,150 +1563,160 @@ xlog_recover_add_to_trans( memcpy(ptr, dp, len); in_f = (xfs_inode_log_format_t *)ptr; - if (item->ri_prev->ri_total != 0 && - item->ri_prev->ri_total == item->ri_prev->ri_cnt) { + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); } - item = trans->r_itemq; - item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ - item->ri_total = in_f->ilf_size; - ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); - item->ri_buf = kmem_zalloc((item->ri_total * - sizeof(xfs_log_iovec_t)), KM_SLEEP); + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + kmem_free(ptr); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } -STATIC void -xlog_recover_new_tid( - xlog_recover_t **q, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - xlog_recover_put_hashq(q, trans); -} - -STATIC int -xlog_recover_unlink_tid( - xlog_recover_t **q, - xlog_recover_t *trans) -{ - xlog_recover_t *tp; - int found = 0; - - ASSERT(trans != 0); - if (trans == *q) { - *q = (*q)->r_next; - } else { - tp = *q; - while (tp != 0) { - if (tp->r_next == trans) { - found = 1; - break; - } - tp = tp->r_next; - } - if (!found) { - xlog_warn( - "XFS: xlog_recover_unlink_tid: trans not found"); - ASSERT(0); - return XFS_ERROR(EIO); - } - tp->r_next = tp->r_next->r_next; - } - return 0; -} - -STATIC void -xlog_recover_insert_item_backq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - if (*q == 0) { - item->ri_prev = item->ri_next = item; - *q = item; - } else { - item->ri_next = *q; - item->ri_prev = (*q)->ri_prev; - (*q)->ri_prev = item; - item->ri_prev->ri_next = item; - } -} - -STATIC void -xlog_recover_insert_item_frontq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - xlog_recover_insert_item_backq(q, item); - *q = item; -} - +/* + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. + */ STATIC int xlog_recover_reorder_trans( - xlog_t *log, - xlog_recover_t *trans) + struct xlog *log, + struct xlog_recover *trans, + int pass) { - xlog_recover_item_t *first_item, *itemq, *itemq_next; - xfs_buf_log_format_t *buf_f; - xfs_buf_log_format_v1_t *obuf_f; - ushort flags = 0; - - first_item = itemq = trans->r_itemq; - trans->r_itemq = NULL; - do { - itemq_next = itemq->ri_next; - buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; - switch (ITEM_TYPE(itemq)) { - case XFS_LI_BUF: - flags = buf_f->blf_flags; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - flags = obuf_f->blf_flags; + xlog_recover_item_t *item, *n; + int error = 0; + LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); + + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + + switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); break; - } - - switch (ITEM_TYPE(itemq)) { case XFS_LI_BUF: - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - if (!(flags & XFS_BLI_CANCEL)) { - xlog_recover_insert_item_frontq(&trans->r_itemq, - itemq); + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); break; } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: - case XFS_LI_6_1_INODE: - case XFS_LI_5_3_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: - xlog_recover_insert_item_backq(&trans->r_itemq, itemq); + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &inode_list); break; default: - xlog_warn( - "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } - itemq = itemq_next; - } while (first_item != itemq); - return 0; + } +out: + ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; } /* @@ -1574,258 +1731,160 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. */ -STATIC void -xlog_recover_do_buffer_pass1( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +STATIC int +xlog_recover_buffer_pass1( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *nextp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; - xfs_buf_log_format_v1_t *obuf_f; - xfs_daddr_t blkno = 0; - uint len = 0; - ushort flags = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - len = obuf_f->blf_len; - flags = obuf_f->blf_flags; - break; - } + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) - return; - - /* - * Insert an xfs_buf_cancel record into the hash table of - * them. If there is already an identical record, bump - * its reference count. - */ - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - /* - * If the hash bucket is empty then just insert a new record into - * the bucket. - */ - if (*bucket == NULL) { - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; - bcp->bc_refcount = 1; - bcp->bc_next = NULL; - *bucket = bcp; - return; + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); + return 0; } /* - * The hash bucket is not empty, so search for duplicates of our - * record. If we find one them just bump its refcount. If not - * then add us at the end of the list. + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. */ - prevp = NULL; - nextp = *bucket; - while (nextp != NULL) { - if (nextp->bc_blkno == blkno && nextp->bc_len == len) { - nextp->bc_refcount++; - return; + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); + return 0; } - prevp = nextp; - nextp = nextp->bc_next; } - ASSERT(prevp != NULL); - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; - bcp->bc_next = NULL; - prevp->bc_next = bcp; + list_add_tail(&bcp->bc_list, bucket); + + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; } /* * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it does then return 1 - * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement - * the refcount on the entry in the table and remove it from the table - * if this is the last reference. - * - * We remove the cancel record from the table when we encounter its - * last occurrence in the log so that if the same buffer is re-used - * again after its last cancellation we actually replay the changes - * made at that point. + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. */ -STATIC int -xlog_check_buffer_cancelled( - xlog_t *log, +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; - if (log->l_buf_cancel_table == NULL) { - /* - * There is nothing in the table built in pass one, - * so this buffer must not be cancelled. - */ - ASSERT(!(flags & XFS_BLI_CANCEL)); - return 0; + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; } - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - bcp = *bucket; - if (bcp == NULL) { - /* - * There is no corresponding entry in the table built - * in pass one, so this buffer has not been cancelled. - */ - ASSERT(!(flags & XFS_BLI_CANCEL)); - return 0; + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; } /* - * Search for an entry in the buffer cancel table that - * matches our buffer. - */ - prevp = NULL; - while (bcp != NULL) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) { - /* - * We've go a match, so return 1 so that the - * recovery of this buffer is cancelled. - * If this buffer is actually a buffer cancel - * log item, then decrement the refcount on the - * one in the table and remove it if this is the - * last reference. - */ - if (flags & XFS_BLI_CANCEL) { - bcp->bc_refcount--; - if (bcp->bc_refcount == 0) { - if (prevp == NULL) { - *bucket = bcp->bc_next; - } else { - prevp->bc_next = bcp->bc_next; - } - kmem_free(bcp, - sizeof(xfs_buf_cancel_t)); - } - } - return 1; - } - prevp = bcp; - bcp = bcp->bc_next; - } - /* - * We didn't find a corresponding entry in the table, so - * return 0 so that the buffer is NOT cancelled. + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. */ - ASSERT(!(flags & XFS_BLI_CANCEL)); - return 0; + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; } +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ STATIC int -xlog_recover_do_buffer_pass2( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) { - xfs_buf_log_format_v1_t *obuf_f; - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; + struct xfs_buf_cancel *bcp; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - flags = obuf_f->blf_flags; - len = (xfs_daddr_t) obuf_f->blf_len; - break; - } + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; - return xlog_check_buffer_cancelled(log, blkno, len, flags); + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; } /* - * Perform recovery for a buffer full of inodes. In these buffers, - * the only data which should be recovered is that which corresponds - * to the di_next_unlinked pointers in the on disk inode structures. - * The rest of the data for the inodes is always logged through the - * inodes themselves rather than the inode buffer and is recovered - * in xlog_recover_do_inode_trans(). + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * - * The only time when buffers full of inodes are fully recovered is - * when the buffer is full of newly allocated inodes. In this case - * the buffer will not be marked as an inode buffer and so will be - * sent to xlog_recover_do_reg_buffer() below during recovery. + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index; - int bit; - int nbits; - int reg_buf_offset; - int reg_buf_bytes; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - xfs_buf_log_format_v1_t *obuf_f; - unsigned int *data_map = NULL; - unsigned int map_size = 0; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - data_map = obuf_f->blf_data_map; - map_size = obuf_f->blf_map_size; - break; - } + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + /* - * Set the variables corresponding to the current region to - * 0 so that we'll initialize them on the first pass through - * the loop. + * Post recovery validation only works properly on CRC enabled + * filesystems. */ - reg_buf_offset = 0; - reg_buf_bytes = 0; - bit = 0; - nbits = 0; - item_index = 0; - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1839,21 +1898,21 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) { + if (bit == -1) return 0; - } - nbits = xfs_contig_bits(data_map, map_size, - bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1862,25 +1921,25 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ - if (next_unlinked_offset < reg_buf_offset) { + if (next_unlinked_offset < reg_buf_offset) continue; - } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = (xfs_agino_t *) - ((char *)(item->ri_buf[item_index].i_addr) + - (next_unlinked_offset - reg_buf_offset)); + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { - xfs_fs_cmn_err(CE_ALERT, mp, - "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", + xfs_alert(mp, + "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", XFS_ERRLEVEL_LOW, mp); @@ -1890,57 +1949,405 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; } /* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. + */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. */ -/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; - xfs_buf_log_format_v1_t *obuf_f; - unsigned int *data_map = NULL; - unsigned int map_size = 0; int error; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - data_map = obuf_f->blf_data_map; - map_size = obuf_f->blf_map_size; - break; - } + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(data_map, map_size, bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); - ASSERT(item->ri_buf[i].i_addr != 0); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ASSERT(item->ri_buf[i].i_addr != NULL); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1949,164 +2356,67 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { - error = xfs_qm_dqcheck((xfs_disk_dquot_t *) - item->ri_buf[i].i_addr, + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { + if (item->ri_buf[i].i_addr == NULL) { + xfs_alert(mp, + "XFS: NULL dquot in %s.", __func__); + goto next; + } + if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(mp, + "XFS: dquot too small (%d) in %s.", + item->ri_buf[i].i_len, __func__); + goto next; + } + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); + if (error) + goto next; } - if (!error) - memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ - item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + + memcpy(xfs_buf_offset(bp, + (uint)bit << XFS_BLF_SHIFT), /* dest */ + item->ri_buf[i].i_addr, /* source */ + nbits<<XFS_BLF_SHIFT); /* length */ + next: i++; bit += nbits; } /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); -} - -/* - * Do some primitive error checking on ondisk dquot data structures. - */ -int -xfs_qm_dqcheck( - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags, - char *str) -{ - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - - /* - * We can encounter an uninitialized dquot buffer for 2 reasons: - * 1. If we crash while deleting the quotainode(s), and those blks got - * used for user data. This is because we take the path of regular - * file deletion; however, the size field of quotainodes is never - * updated, so all the tricks that we play in itruncate_finish - * don't quite matter. - * - * 2. We don't play the quota buffers when there's a quotaoff logitem. - * But the allocation will be replayed so we'll end up with an - * uninitialized quota block. - * - * This is all fine; things are still consistent, and we haven't lost - * any quota information. Just don't complain about bad dquot blks. - */ - if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } - - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } - - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } - - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%p) " - "BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%p) " - "INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%p) " - "RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } - - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; - - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); /* - * Typically, a repair is only requested by quotacheck. + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems */ - ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); - memset(d, 0, sizeof(xfs_dqblk_t)); - - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); - - return errs; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. */ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + /* * Filesystems are required to send in quota flags at mount time. */ @@ -2115,11 +2425,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2140,7 +2450,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2150,96 +2460,67 @@ xlog_recover_do_dquot_buffer( * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * records in the table. See xlog_recover_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_do_buffer_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_buffer_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { - xfs_buf_log_format_t *buf_f; - xfs_buf_log_format_v1_t *obuf_f; - xfs_mount_t *mp; + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; - int cancel; - xfs_daddr_t blkno; - int len; - ushort flags; - - buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; + uint buf_flags; + xfs_lsn_t lsn; - if (pass == XLOG_RECOVER_PASS1) { - /* - * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. - */ - xlog_recover_do_buffer_pass1(log, buf_f); + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; - } else { - /* - * In this pass we want to recover all the buffers - * which have not been cancelled and are not - * cancellation buffers themselves. The routine - * we call here will tell us whether or not to - * continue with the replay of this buffer. - */ - cancel = xlog_recover_do_buffer_pass2(log, buf_f); - if (cancel) { - return 0; - } - } - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = obuf_f->blf_blkno; - len = obuf_f->blf_len; - flags = obuf_f->blf_flags; - break; - default: - xfs_fs_cmn_err(CE_ALERT, log->l_mp, - "xfs_log_recover: unknown buffer type 0x%x, logdev %s", - buf_f->blf_type, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); - XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", - XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); } - mp = log->l_mp; - if (flags & XFS_BLI_INODE_BUF) { - bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, - XFS_BUF_LOCK); - } else { - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); - } - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, - bp, blkno); - error = XFS_BUF_GETERROR(bp); - xfs_buf_relse(bp); - return error; + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, NULL); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); + goto out_release; } - error = 0; - if (flags & XFS_BLI_INODE_BUF) { + /* + * recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release; + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) - return XFS_ERROR(error); + goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2247,239 +2528,316 @@ xlog_recover_do_buffer_trans( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == - INT_GET(*((__uint16_t *)(xfs_buf_offset(bp, 0))), ARCH_CONVERT) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)log->l_mp->m_inode_cluster_size))) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); } else { - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || - XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); - XFS_BUF_SET_FSPRIVATE(bp, mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + } + +out_release: + xfs_buf_relse(bp); + return error; +} + +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; } - return (error); + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; } STATIC int -xlog_recover_do_inode_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_inode_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; - xfs_imap_t imap; xfs_dinode_t *dip; - xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; int error; int attr_index; uint fields; - xfs_dinode_core_t *dicp; + xfs_icdinode_t *dicp; + uint isize; int need_free = 0; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { - in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; + in_f = item->ri_buf[0].i_addr; } else { - in_f = (xfs_inode_log_format_t *)kmem_alloc( - sizeof(xfs_inode_log_format_t), KM_SLEEP); + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) goto error; } - ino = in_f->ilf_ino; - mp = log->l_mp; - if (ITEM_TYPE(item) == XFS_LI_INODE) { - imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno; - imap.im_len = in_f->ilf_len; - imap.im_boffset = in_f->ilf_boffset; - } else { - /* - * It's an old inode format record. We don't know where - * its cluster is located on disk, and we can't allow - * xfs_imap() to figure it out because the inode btrees - * are not ready to be used. Therefore do not pass the - * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give - * us only the single block in which the inode lives - * rather than its cluster, so we must make sure to - * invalidate the buffer when we write it out below. - */ - imap.im_blkno = 0; - xfs_imap(log->l_mp, NULL, ino, &imap, 0); - } /* * Inode buffers can be freed, look out for it, * and do not replay the inode. */ - if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) { + if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, + in_f->ilf_len, 0)) { error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } + trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, - XFS_BUF_LOCK); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, imap.im_blkno); - error = XFS_BUF_GETERROR(bp); - xfs_buf_relse(bp); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + &xfs_inode_buf_ops); + if (!bp) { + error = ENOMEM; goto error; } - error = 0; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); + goto out_release; + } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); + dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks * like an inode! */ - if (unlikely(INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } - dicp = (xfs_dinode_core_t*)(item->ri_buf[1].i_addr); + dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < - INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT)) { + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers */ - if ((INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT) - == DI_MAX_FLUSH) && - (dicp->di_flushiter < (DI_MAX_FLUSH>>1))) { + if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && + dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { - xfs_buf_relse(bp); + trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto error; + goto out_release; } } + /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } - } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", - item, dip, bp, ino, + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; - goto error; + goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", - item, dip, bp, ino, dicp->di_forkoff); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; - goto error; + goto out_release; } - if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", - item->ri_buf[1].i_len, item); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ - xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core, - (xfs_dinode_core_t*)item->ri_buf[1].i_addr, -1); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) { - memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t), - item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t), - item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { case XFS_ILOG_DEV: - INT_SET(dip->di_u.di_dev, ARCH_CONVERT, in_f->ilf_u.ilfu_rdev); - + xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); break; case XFS_ILOG_UUID: - dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid; + memcpy(XFS_DFORK_DPTR(dip), + &in_f->ilf_u.ilfu_uuid, + sizeof(uuid_t)); break; } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2490,12 +2848,12 @@ xlog_recover_do_inode_trans( switch (fields & XFS_ILOG_DFORK) { case XFS_ILOG_DDATA: case XFS_ILOG_DEXT: - memcpy(&dip->di_u, src, len); + memcpy(XFS_DFORK_DPTR(dip), src, len); break; case XFS_ILOG_DBROOT: - xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, - &(dip->di_u.di_bmbt), + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, + (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), XFS_DFORK_DSIZE(dip, mp)); break; @@ -2532,56 +2890,49 @@ xlog_recover_do_inode_trans( case XFS_ILOG_ABROOT: dest = XFS_DFORK_APTR(dip); - xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len, - (xfs_bmdr_block_t*)dest, + xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, + len, (xfs_bmdr_block_t*)dest, XFS_DFORK_ASIZE(dip, mp)); break; default: - xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: - if (ITEM_TYPE(item) == XFS_LI_INODE) { - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || - XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); - XFS_BUF_SET_FSPRIVATE(bp, mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); - } else { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); - } +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); error: if (need_free) - kmem_free(in_f, sizeof(*in_f)); + kmem_free(in_f); return XFS_ERROR(error); } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. */ STATIC int -xlog_recover_do_quotaoff_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_quotaoff_pass1( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_qoff_logformat_t *qoff_f; - - if (pass == XLOG_RECOVER_PASS2) { - return (0); - } - - qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2602,22 +2953,19 @@ xlog_recover_do_quotaoff_trans( * Recover a dquot record */ STATIC int -xlog_recover_do_dquot_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_dquot_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. @@ -2625,13 +2973,21 @@ xlog_recover_do_dquot_trans( if (mp->m_qflags == 0) return (0); - recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; - ASSERT(recddq); + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); + return XFS_ERROR(EIO); + } + if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", + item->ri_buf[1].i_len, __func__); + return XFS_ERROR(EIO); + } + /* * This type of quotas was turned off, so ignore this record. */ - type = INT_GET(recddq->d_flags, ARCH_CONVERT) & - (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); ASSERT(type); if (log->l_quotaoffs_flag & type) return (0); @@ -2646,25 +3002,20 @@ xlog_recover_do_dquot_trans( * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. */ - dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - if ((error = xfs_qm_dqcheck(recddq, - dq_f->qlf_id, - 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans (log copy)"))) { + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) return XFS_ERROR(EIO); - } ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2673,22 +3024,40 @@ xlog_recover_do_dquot_trans( * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ - if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans")) { + error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2"); + if (error) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || - XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); - XFS_BUF_SET_FSPRIVATE(bp, mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); - return (0); +out_release: + xfs_buf_relse(bp); + return 0; } /* @@ -2699,39 +3068,31 @@ xlog_recover_do_dquot_trans( * LSN. */ STATIC int -xlog_recover_do_efi_trans( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn, - int pass) +xlog_recover_efi_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - SPLDECL(s); - - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].i_addr; - mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - efip->efi_next_extent = efi_formatp->efi_nextents; - efip->efi_flags |= XFS_EFI_COMMITTED; + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); - AIL_LOCK(mp,s); + spin_lock(&log->l_ailp->xa_lock); /* - * xfs_trans_update_ail() drops the AIL lock. + * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn, s); + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } @@ -2744,25 +3105,19 @@ xlog_recover_do_efi_trans( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC void -xlog_recover_do_efd_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +STATIC int +xlog_recover_efd_pass2( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_mount_t *mp; xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; xfs_log_item_t *lip; - int gen; __uint64_t efi_id; - SPLDECL(s); - - if (pass == XLOG_RECOVER_PASS1) { - return; - } + struct xfs_ail_cursor cur; + struct xfs_ail *ailp = log->l_ailp; - efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + @@ -2773,97 +3128,116 @@ xlog_recover_do_efd_trans( * Search for the efi with the id in the efd format structure * in the AIL. */ - mp = log->l_mp; - AIL_LOCK(mp,s); - lip = xfs_trans_first_ail(mp, &gen); + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { if (lip->li_type == XFS_LI_EFI) { efip = (xfs_efi_log_item_t *)lip; if (efip->efi_format.efi_id == efi_id) { /* - * xfs_trans_delete_ail() drops the + * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_delete_ail(mp, lip, s); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); + xfs_efi_item_free(efip); + spin_lock(&ailp->xa_lock); break; } } - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + lip = xfs_trans_ail_cursor_next(ailp, &cur); } + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); - /* - * If we found it, then free it up. If it wasn't there, it - * must have been overwritten in the log. Oh well. - */ - if (lip != NULL) { - xfs_efi_item_free(efip); - } else { - AIL_UNLOCK(mp, s); - } + return 0; } /* - * Perform the transaction - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. */ STATIC int -xlog_recover_do_trans( - xlog_t *log, - xlog_recover_t *trans, - int pass) +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) { - int error = 0; - xlog_recover_item_t *item, *first_item; + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return EINVAL; + } - if ((error = xlog_recover_reorder_trans(log, trans))) - return error; - first_item = item = trans->r_itemq; - do { - /* - * we don't need to worry about the block number being - * truncated in > 1 TB buffers because in user-land, - * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so - * the blknos will get through the user-mode buffer - * cache properly. The only bad case is o32 kernels - * where xfs_daddr_t is 32-bits but mount will warn us - * off a > 1 TB filesystem before we get here. - */ - if ((ITEM_TYPE(item) == XFS_LI_BUF) || - (ITEM_TYPE(item) == XFS_LI_6_1_BUF) || - (ITEM_TYPE(item) == XFS_LI_5_3_BUF)) { - if ((error = xlog_recover_do_buffer_trans(log, item, - pass))) - break; - } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) { - if ((error = xlog_recover_do_inode_trans(log, item, - pass))) - break; - } else if (ITEM_TYPE(item) == XFS_LI_EFI) { - if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn, - pass))) - break; - } else if (ITEM_TYPE(item) == XFS_LI_EFD) { - xlog_recover_do_efd_trans(log, item, pass); - } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { - if ((error = xlog_recover_do_dquot_trans(log, item, - pass))) - break; - } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { - if ((error = xlog_recover_do_quotaoff_trans(log, item, - pass))) - break; - } else { - xlog_warn("XFS: xlog_recover_do_trans"); - ASSERT(0); - error = XFS_ERROR(EIO); - break; - } - item = item->ri_next; - } while (first_item != item); + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return EINVAL; + } - return error; + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; } /* @@ -2873,52 +3247,285 @@ xlog_recover_do_trans( */ STATIC void xlog_recover_free_trans( - xlog_recover_t *trans) + struct xlog_recover *trans) { - xlog_recover_item_t *first_item, *item, *free_item; + xlog_recover_item_t *item, *n; int i; - item = first_item = trans->r_itemq; - do { - free_item = item; - item = item->ri_next; - /* Free the regions in the item. */ - for (i = 0; i < free_item->ri_cnt; i++) { - kmem_free(free_item->ri_buf[i].i_addr, - free_item->ri_buf[i].i_len); - } + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); /* Free the item itself */ - kmem_free(free_item->ri_buf, - (free_item->ri_total * sizeof(xfs_log_iovec_t))); - kmem_free(free_item, sizeof(xlog_recover_item_t)); - } while (first_item != item); + kmem_free(item->ri_buf); + kmem_free(item); + } /* Free the transaction recover structure */ - kmem_free(trans, sizeof(xlog_recover_t)); + kmem_free(trans); +} + +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } } STATIC int +xlog_recover_commit_pass1( + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + case XFS_LI_ICREATE: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ +STATIC int xlog_recover_commit_trans( - xlog_t *log, - xlog_recover_t **q, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { - int error; + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); - if ((error = xlog_recover_unlink_tid(q, trans))) - return error; - if ((error = xlog_recover_do_trans(log, trans, pass))) + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + + hlist_del(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) return error; - xlog_recover_free_trans(trans); /* no error */ - return 0; + + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + switch (pass) { + case XLOG_RECOVER_PASS1: + error = xlog_recover_commit_pass1(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + + break; + default: + ASSERT(0); + } + + if (error) + goto out; + } + +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + + xlog_recover_free_trans(trans); + + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; } STATIC int xlog_recover_unmount_trans( - xlog_recover_t *trans) + struct xlog *log) { /* Do nothing now */ - xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); return 0; } @@ -2933,9 +3540,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, - xlog_recover_t *rhash[], - xlog_rec_header_t *rhead, + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2948,8 +3555,8 @@ xlog_recover_process_data( unsigned long hash; uint flags; - lp = dp + INT_GET(rhead->h_len, ARCH_CONVERT); - num_logops = INT_GET(rhead->h_num_logops, ARCH_CONVERT); + lp = dp + be32_to_cpu(rhead->h_len); + num_logops = be32_to_cpu(rhead->h_num_logops); /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) @@ -2961,59 +3568,65 @@ xlog_recover_process_data( dp += sizeof(xlog_op_header_t); if (ohead->oh_clientid != XFS_TRANSACTION && ohead->oh_clientid != XFS_LOG) { - xlog_warn( - "XFS: xlog_recover_process_data: bad clientid"); + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); ASSERT(0); return (XFS_ERROR(EIO)); } - tid = INT_GET(ohead->oh_tid, ARCH_CONVERT); + tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); - trans = xlog_recover_find_tid(rhash[hash], tid); + trans = xlog_recover_find_tid(&rhash[hash], tid); if (trans == NULL) { /* not found; add new tid */ if (ohead->oh_flags & XLOG_START_TRANS) xlog_recover_new_tid(&rhash[hash], tid, - INT_GET(rhead->h_lsn, ARCH_CONVERT)); + be64_to_cpu(rhead->h_lsn)); } else { - ASSERT(dp+INT_GET(ohead->oh_len, ARCH_CONVERT) <= lp); + if (dp + be32_to_cpu(ohead->oh_len) > lp) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", + __func__, be32_to_cpu(ohead->oh_len)); + WARN_ON(1); + return (XFS_ERROR(EIO)); + } flags = ohead->oh_flags & ~XLOG_END_TRANS; if (flags & XLOG_WAS_CONT_TRANS) flags &= ~XLOG_CONTINUE_TRANS; switch (flags) { case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, - &rhash[hash], trans, pass); + trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(trans, - dp, INT_GET(ohead->oh_len, - ARCH_CONVERT)); + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: - xlog_warn( - "XFS: xlog_recover_process_data: bad transaction"); + xfs_warn(log->l_mp, "%s: bad transaction", + __func__); ASSERT(0); error = XFS_ERROR(EIO); break; case 0: case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(trans, - dp, INT_GET(ohead->oh_len, - ARCH_CONVERT)); + error = xlog_recover_add_to_trans(log, trans, + dp, be32_to_cpu(ohead->oh_len)); break; default: - xlog_warn( - "XFS: xlog_recover_process_data: bad flag"); + xfs_warn(log->l_mp, "%s: bad flag 0x%x", + __func__, flags); ASSERT(0); error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } - dp += INT_GET(ohead->oh_len, ARCH_CONVERT); + dp += be32_to_cpu(ohead->oh_len); num_logops--; } return 0; @@ -3023,7 +3636,7 @@ xlog_recover_process_data( * Process an extent free intent item that was recovered from * the log. We need to free the extents that it describes. */ -STATIC void +STATIC int xlog_recover_process_efi( xfs_mount_t *mp, xfs_efi_log_item_t *efip) @@ -3031,10 +3644,11 @@ xlog_recover_process_efi( xfs_efd_log_item_t *efdp; xfs_trans_t *tp; int i; + int error = 0; xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the @@ -3053,52 +3667,35 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); - return; + return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) + goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &(efip->efi_format.efi_extents[i]); - xfs_free_extent(tp, extp->ext_start, extp->ext_len); + error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); + if (error) + goto abort_error; xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, extp->ext_len); } - efip->efi_flags |= XFS_EFI_RECOVERED; - xfs_trans_commit(tp, 0, NULL); -} - -/* - * Verify that once we've encountered something other than an EFI - * in the AIL that there are no more EFIs in the AIL. - */ -#if defined(DEBUG) -STATIC void -xlog_recover_check_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip, - int gen) -{ - int orig_gen = gen; + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + error = xfs_trans_commit(tp, 0); + return error; - do { - ASSERT(lip->li_type != XFS_LI_EFI); - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); - /* - * The check will be bogus if we restart from the - * beginning of the AIL, so ASSERT that we don't. - * We never should since we're holding the AIL lock - * the entire time. - */ - ASSERT(gen == orig_gen); - } while (lip != NULL); +abort_error: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); + return error; } -#endif /* DEBUG */ /* * When this is called, all of the EFIs which did not have @@ -3118,26 +3715,29 @@ xlog_recover_check_ail( * everything already in the AIL, we stop processing as soon as * we see something other than an EFI in the AIL. */ -STATIC void +STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; - int gen; - xfs_mount_t *mp; - SPLDECL(s); - - mp = log->l_mp; - AIL_LOCK(mp,s); + int error = 0; + struct xfs_ail_cursor cur; + struct xfs_ail *ailp; - lip = xfs_trans_first_ail(mp, &gen); + ailp = log->l_ailp; + spin_lock(&ailp->xa_lock); + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); while (lip != NULL) { /* * We're done when we see something other than an EFI. + * There should be no EFIs left in the AIL now. */ if (lip->li_type != XFS_LI_EFI) { - xlog_recover_check_ail(mp, lip, gen); +#ifdef DEBUG + for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) + ASSERT(lip->li_type != XFS_LI_EFI); +#endif break; } @@ -3145,17 +3745,22 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_flags & XFS_EFI_RECOVERED) { - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { + lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } - AIL_UNLOCK(mp, s); - xlog_recover_process_efi(mp, efip); - AIL_LOCK(mp,s); - lip = xfs_trans_next_ail(mp, lip, &gen, NULL); + spin_unlock(&ailp->xa_lock); + error = xlog_recover_process_efi(log->l_mp, efip); + spin_lock(&ailp->xa_lock); + if (error) + goto out; + lip = xfs_trans_ail_cursor_next(ailp, &cur); } - AIL_UNLOCK(mp, s); +out: + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + return error; } /* @@ -3175,29 +3780,87 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); + if (error) + goto out_abort; - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &agibp); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - return; - } + error = xfs_read_agi(mp, tp, agno, &agibp); + if (error) + goto out_abort; agi = XFS_BUF_TO_AGI(agibp); - if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); - return; - } - agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - (void) xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); + if (error) + goto out_error; + return; + +out_abort: + xfs_trans_cancel(tp, XFS_TRANS_ABORT); +out_error: + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); + return; +} + +STATIC xfs_agino_t +xlog_recover_process_one_iunlink( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agino_t agino, + int bucket) +{ + struct xfs_buf *ibp; + struct xfs_dinode *dip; + struct xfs_inode *ip; + xfs_ino_t ino; + int error; + + ino = XFS_AGINO_TO_INO(mp, agno, agino); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + goto fail; + + /* + * Get the on disk inode to find the next inode in the bucket. + */ + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); + if (error) + goto fail_iput; + + ASSERT(ip->i_d.di_nlink == 0); + ASSERT(ip->i_d.di_mode != 0); + + /* setup for the next pass */ + agino = be32_to_cpu(dip->di_next_unlinked); + xfs_buf_relse(ibp); + + /* + * Prevent any DMAPI event from being sent when the reference on + * the inode is dropped. + */ + ip->i_d.di_dmevmask = 0; + + IRELE(ip); + return agino; + + fail_iput: + IRELE(ip); + fail: + /* + * We can't read in the inode this bucket points to, or this inode + * is messed up. Just ditch this bucket of inodes. We will lose + * some inodes and space, but at least we won't hang. + * + * Call xlog_recover_clear_agi_bucket() to perform a transaction to + * clear the inode pointer in the bucket. + */ + xlog_recover_clear_agi_bucket(mp, agno, bucket); + return NULLAGINO; } /* @@ -3212,19 +3875,15 @@ xlog_recover_clear_agi_bucket( * freeing of the inode and its removal from the list must be * atomic. */ -void +STATIC void xlog_recover_process_iunlinks( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; xfs_agi_t *agi; xfs_buf_t *agibp; - xfs_buf_t *ibp; - xfs_dinode_t *dip; - xfs_inode_t *ip; xfs_agino_t agino; - xfs_ino_t ino; int bucket; int error; uint mp_dmevmask; @@ -3241,279 +3900,136 @@ xlog_recover_process_iunlinks( /* * Find the agi for this ag. */ - agibp = xfs_buf_read(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agibp)) { - xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)", - log->l_mp, agibp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp))); + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + /* + * AGI is b0rked. Don't process it. + * + * We should probably mark the filesystem as corrupt + * after we've recovered all the ag's we can.... + */ + continue; } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ agi = XFS_BUF_TO_AGI(agibp); - ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum)); + xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { - agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - - /* - * Release the agi buffer so that it can - * be acquired in the normal course of the - * transaction to truncate and free the inode. - */ - xfs_buf_relse(agibp); - - ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); - ASSERT(error || (ip != NULL)); - - if (!error) { - /* - * Get the on disk inode to find the - * next inode in the bucket. - */ - error = xfs_itobp(mp, NULL, ip, &dip, - &ibp, 0, 0); - ASSERT(error || (dip != NULL)); - } - - if (!error) { - ASSERT(ip->i_d.di_nlink == 0); - - /* setup for the next pass */ - agino = INT_GET(dip->di_next_unlinked, - ARCH_CONVERT); - xfs_buf_relse(ibp); - /* - * Prevent any DMAPI event from - * being sent when the - * reference on the inode is - * dropped. - */ - ip->i_d.di_dmevmask = 0; - - /* - * If this is a new inode, handle - * it specially. Otherwise, - * just drop our reference to the - * inode. If there are no - * other references, this will - * send the inode to - * xfs_inactive() which will - * truncate the file and free - * the inode. - */ - if (ip->i_d.di_mode == 0) - xfs_iput_new(ip, 0); - else - VN_RELE(XFS_ITOV(ip)); - } else { - /* - * We can't read in the inode - * this bucket points to, or - * this inode is messed up. Just - * ditch this bucket of inodes. We - * will lose some inodes and space, - * but at least we won't hang. Call - * xlog_recover_clear_agi_bucket() - * to perform a transaction to clear - * the inode pointer in the bucket. - */ - xlog_recover_clear_agi_bucket(mp, agno, - bucket); - - agino = NULLAGINO; - } - - /* - * Reacquire the agibuffer and continue around - * the loop. - */ - agibp = xfs_buf_read(mp->m_ddev_targp, - XFS_AG_DADDR(mp, agno, - XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agibp)) { - xfs_ioerror_alert( - "xlog_recover_process_iunlinks(#2)", - log->l_mp, agibp, - XFS_AG_DADDR(mp, agno, - XFS_AGI_DADDR(mp))); - } - agi = XFS_BUF_TO_AGI(agibp); - ASSERT(XFS_AGI_MAGIC == be32_to_cpu( - agi->agi_magicnum)); + agino = xlog_recover_process_one_iunlink(mp, + agno, agino, bucket); } } - - /* - * Release the buffer for the current agi so we can - * go on to the next one. - */ - xfs_buf_relse(agibp); + xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - xlog_t *log, - xlog_in_core_t *iclog, - int size) -{ - int i; - uint *up; - uint chksum = 0; - - up = (uint *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= INT_GET(*up, ARCH_CONVERT); - up++; - } - INT_SET(iclog->ic_header.h_chksum, ARCH_CONVERT, chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - uint cycle_lsn; - xfs_caddr_t dp; - xlog_in_core_2_t *xhdr; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(uint *)dp; - *(uint *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - xhdr = (xlog_in_core_2_t *)&iclog->ic_header; - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(uint *)dp; - *(uint *)dp = cycle_lsn; - dp += BBSIZE; + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } -} - -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) -STATIC void -xlog_unpack_data_checksum( - xlog_rec_header_t *rhead, - xfs_caddr_t dp, - xlog_t *log) -{ - uint *up = (uint *)dp; - uint chksum = 0; - int i; - /* divide length by 4 to get # words */ - for (i=0; i < INT_GET(rhead->h_len, ARCH_CONVERT) >> 2; i++) { - chksum ^= INT_GET(*up, ARCH_CONVERT); - up++; - } - if (chksum != INT_GET(rhead->h_chksum, ARCH_CONVERT)) { - if (rhead->h_chksum || - ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { - cmn_err(CE_DEBUG, - "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - INT_GET(rhead->h_chksum, ARCH_CONVERT), chksum); - cmn_err(CE_DEBUG, -"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - cmn_err(CE_DEBUG, - "XFS: LogR this is a LogV2 filesystem\n"); - } - log->l_flags |= XLOG_CHKSUM_MISMATCH; - } - } + return 0; } -#else -#define xlog_unpack_data_checksum(rhead, dp, log) -#endif -STATIC void +STATIC int xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; - xlog_in_core_2_t *xhdr; + int error; - for (i = 0; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)) && + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(uint *)dp = *(uint *)&rhead->h_cycle_data[i]; + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; dp += BBSIZE; } - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { - xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); i++) { + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(uint *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; dp += BBSIZE; } } - xlog_unpack_data_checksum(rhead, dp, log); + return 0; } STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; - if (unlikely( - (INT_GET(rhead->h_magicno, ARCH_CONVERT) != - XLOG_HEADER_MAGIC_NUM))) { + if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); } if (unlikely( (!rhead->h_version || - (INT_GET(rhead->h_version, ARCH_CONVERT) & - (~XLOG_VERSION_OKBITS)) != 0))) { - xlog_warn("XFS: %s: unrecognised log version (%d).", - __FUNCTION__, INT_GET(rhead->h_version, ARCH_CONVERT)); + (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { + xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", + __func__, be32_to_cpu(rhead->h_version)); return XFS_ERROR(EIO); } /* LR body must have data or it wouldn't have been written */ - hlen = INT_GET(rhead->h_len, ARCH_CONVERT); + hlen = be32_to_cpu(rhead->h_len); if (unlikely( hlen <= 0 || hlen > INT_MAX )) { XFS_ERROR_REPORT("xlog_valid_rec_header(2)", XFS_ERRLEVEL_LOW, log->l_mp); @@ -3537,19 +4053,19 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t bufaddr, offset; + xfs_caddr_t offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; - xlog_recover_t *rhash[XLOG_RHASH_SIZE]; + struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); @@ -3557,7 +4073,7 @@ xlog_do_recovery_pass( * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. */ - if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb)) { + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { /* * When using variable length iclogs, read first sector of * iclog header and extract the header size from it. Get a @@ -3566,16 +4082,17 @@ xlog_do_recovery_pass( hbp = xlog_get_bp(log, 1); if (!hbp) return ENOMEM; - if ((error = xlog_bread(log, tail_blk, 1, hbp))) + + error = xlog_bread(log, tail_blk, 1, hbp, &offset); + if (error) goto bread_err1; - offset = xlog_align(log, tail_blk, 1, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) goto bread_err1; - h_size = INT_GET(rhead->h_size, ARCH_CONVERT); - if ((INT_GET(rhead->h_version, ARCH_CONVERT) - & XLOG_VERSION_2) && + h_size = be32_to_cpu(rhead->h_size); + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { hblks = h_size / XLOG_HEADER_CYCLE_SIZE; if (h_size % XLOG_HEADER_CYCLE_SIZE) @@ -3586,7 +4103,7 @@ xlog_do_recovery_pass( hblks = 1; } } else { - ASSERT(log->l_sectbb_log == 0); + ASSERT(log->l_sectBBsize == 1); hblks = 1; hbp = xlog_get_bp(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; @@ -3603,23 +4120,29 @@ xlog_do_recovery_pass( memset(rhash, 0, sizeof(rhash)); if (tail_blk <= head_blk) { for (blk_no = tail_blk; blk_no < head_blk; ) { - if ((error = xlog_bread(log, blk_no, hblks, hbp))) + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no); if (error) goto bread_err2; /* blocks in data section */ - bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); - error = xlog_bread(log, blk_no + hblks, bblks, dbp); + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no + hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; + + error = xlog_unpack_data(rhead, offset, log); if (error) goto bread_err2; - offset = xlog_align(log, blk_no + hblks, bblks, dbp); - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3634,15 +4157,15 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = NULL; + offset = hbp->b_addr; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { /* Read header in one read */ - error = xlog_bread(log, blk_no, hblks, hbp); + error = xlog_bread(log, blk_no, hblks, hbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); } else { /* This LR is split across physical log end */ if (blk_no != log->l_logBBsize) { @@ -3650,12 +4173,13 @@ xlog_do_recovery_pass( ASSERT(blk_no <= INT_MAX); split_hblks = log->l_logBBsize - (int)blk_no; ASSERT(split_hblks > 0); - if ((error = xlog_bread(log, blk_no, - split_hblks, hbp))) + error = xlog_bread(log, blk_no, + split_hblks, hbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, - split_hblks, hbp); } + /* * Note: this black magic still works with * large sector sizes (non-512) only because: @@ -3668,18 +4192,12 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - bufaddr = XFS_BUF_PTR(hbp); - XFS_BUF_SET_PTR(hbp, - bufaddr + BBTOB(split_hblks), - BBTOB(hblks - split_hblks)); wrapped_hblks = hblks - split_hblks; - error = xlog_bread(log, 0, wrapped_hblks, hbp); + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); if (error) goto bread_err2; - XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks)); - if (!offset) - offset = xlog_align(log, 0, - wrapped_hblks, hbp); } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, @@ -3687,19 +4205,19 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; /* Read in data for log record */ if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp); + error = xlog_bread(log, blk_no, bblks, dbp, + &offset); if (error) goto bread_err2; - offset = xlog_align(log, blk_no, bblks, dbp); } else { /* This log record is split across the * physical end of log */ - offset = NULL; + offset = dbp->b_addr; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -3709,12 +4227,13 @@ xlog_do_recovery_pass( split_bblks = log->l_logBBsize - (int)blk_no; ASSERT(split_bblks > 0); - if ((error = xlog_bread(log, blk_no, - split_bblks, dbp))) + error = xlog_bread(log, blk_no, + split_bblks, dbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, - split_bblks, dbp); } + /* * Note: this black magic still works with * large sector sizes (non-512) only because: @@ -3727,21 +4246,20 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - bufaddr = XFS_BUF_PTR(dbp); - XFS_BUF_SET_PTR(dbp, - bufaddr + BBTOB(split_bblks), - BBTOB(bblks - split_bblks)); - if ((error = xlog_bread(log, wrapped_hblks, - bblks - split_bblks, dbp))) + error = xlog_bread_offset(log, 0, + bblks - split_bblks, dbp, + offset + BBTOB(split_bblks)); + if (error) goto bread_err2; - XFS_BUF_SET_PTR(dbp, bufaddr, h_size); - if (!offset) - offset = xlog_align(log, wrapped_hblks, - bblks - split_bblks, dbp); } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3751,20 +4269,28 @@ xlog_do_recovery_pass( /* read first part of physical log */ while (blk_no < head_blk) { - if ((error = xlog_bread(log, blk_no, hblks, hbp))) + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no, hblks, hbp); + rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, blk_no); if (error) goto bread_err2; - bblks = (int)BTOBB(INT_GET(rhead->h_len, ARCH_CONVERT)); - if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) + + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) goto bread_err2; - offset = xlog_align(log, blk_no+hblks, bblks, dbp); - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3792,11 +4318,11 @@ xlog_do_recovery_pass( */ STATIC int xlog_do_log_recovery( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error; + int error, i; ASSERT(head_blk != tail_blk); @@ -3804,15 +4330,16 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = - (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(xfs_buf_cancel_t*), + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { - kmem_free(log->l_buf_cancel_table, - XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; return error; } @@ -3827,12 +4354,11 @@ xlog_do_log_recovery( int i; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(log->l_buf_cancel_table[i] == NULL); + ASSERT(list_empty(&log->l_buf_cancel_table[i])); } #endif /* DEBUG */ - kmem_free(log->l_buf_cancel_table, - XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*)); + kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; return error; @@ -3843,7 +4369,7 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3855,11 +4381,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - XFS_bflush(log->l_mp->m_ddev_targp); /* * If IO errors happened during recovery, bail out. @@ -3881,15 +4404,24 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. */ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); + ASSERT(!(XFS_BUF_ISWRITE(bp))); XFS_BUF_READ(bp); - xfsbdstrat(log->l_mp, bp); - if ((error = xfs_iowait(bp))) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + XFS_BUF_UNASYNC(bp); + bp->b_ops = &xfs_sb_buf_ops; + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; @@ -3897,11 +4429,14 @@ xlog_do_recover( /* Convert superblock from on-disk format */ sbp = &log->l_mp->m_sb; - xfs_xlatesb(XFS_BUF_TO_SBP(bp), sbp, 1, XFS_SB_ALL_BITS); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); - ASSERT(XFS_SB_GOOD_VERSION(sbp)); + ASSERT(xfs_sb_good_version(sbp)); xfs_buf_relse(bp); + /* We've re-read the superblock so re-initialize per-cpu counters */ + xfs_icsb_reinit_counters(log->l_mp); + xlog_recover_check_summary(log); /* Normal transactions can now occur */ @@ -3916,7 +4451,7 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log) + struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; @@ -3937,15 +4472,32 @@ xlog_recover( * under the vfs layer, so we can get away with it unless * the device itself is read-only, in which case we fail. */ - if ((error = xfs_dev_is_read_only(log->l_mp, - "recovery required"))) { + if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { return error; } - cmn_err(CE_NOTE, - "Starting XFS recovery on filesystem: %s (logdev: %s)", - log->l_mp->m_fsname, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions, then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); error = xlog_do_recover(log, head_blk, tail_blk); log->l_flags |= XLOG_RECOVERY_NEEDED; @@ -3964,8 +4516,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log, - int mfsi_flags) + struct xlog *log) { /* * Now we're ready to do the transactions needed for the @@ -3976,31 +4527,30 @@ xlog_recover_finish( * rather than accepting new requests. */ if (log->l_flags & XLOG_RECOVERY_NEEDED) { - xlog_recover_process_efis(log); + int error; + error = xlog_recover_process_efis(log); + if (error) { + xfs_alert(log->l_mp, "Failed to recover EFIs"); + return error; + } /* * Sync the log to get all the EFIs out of the AIL. * This isn't absolutely necessary, but it helps in * case the unlink transactions would have problems * pushing the EFIs out of the way. */ - xfs_log_force(log->l_mp, (xfs_lsn_t)0, - (XFS_LOG_FORCE | XFS_LOG_SYNC)); + xfs_log_force(log->l_mp, XFS_LOG_SYNC); - if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) { - xlog_recover_process_iunlinks(log); - } + xlog_recover_process_iunlinks(log); xlog_recover_check_summary(log); - cmn_err(CE_NOTE, - "Ending XFS recovery on filesystem: %s (logdev: %s)", - log->l_mp->m_fsname, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); + xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { - cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", - log->l_mp->m_fsname); + xfs_info(log->l_mp, "Ending clean mount"); } return 0; } @@ -4013,23 +4563,17 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; - xfs_agi_t *agip; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_daddr_t agfdaddr; - xfs_daddr_t agidaddr; - xfs_buf_t *sbbp; -#ifdef XFS_LOUD_RECOVERY - xfs_sb_t *sbp; -#endif xfs_agnumber_t agno; __uint64_t freeblks; __uint64_t itotal; __uint64_t ifree; + int error; mp = log->l_mp; @@ -4037,62 +4581,28 @@ xlog_recover_check_summary( itotal = 0LL; ifree = 0LL; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { - agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)); - agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr, - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agfbp)) { - xfs_ioerror_alert("xlog_recover_check_summary(agf)", - mp, agfbp, agfdaddr); + error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); + if (error) { + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); + } else { + agfp = XFS_BUF_TO_AGF(agfbp); + freeblks += be32_to_cpu(agfp->agf_freeblks) + + be32_to_cpu(agfp->agf_flcount); + xfs_buf_relse(agfbp); } - agfp = XFS_BUF_TO_AGF(agfbp); - ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum)); - ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum))); - ASSERT(be32_to_cpu(agfp->agf_seqno) == agno); - - freeblks += be32_to_cpu(agfp->agf_freeblks) + - be32_to_cpu(agfp->agf_flcount); - xfs_buf_relse(agfbp); - - agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)); - agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr, - XFS_FSS_TO_BB(mp, 1), 0); - if (XFS_BUF_ISERROR(agibp)) { - xfs_ioerror_alert("xlog_recover_check_summary(agi)", - mp, agibp, agidaddr); + + error = xfs_read_agi(mp, NULL, agno, &agibp); + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { + struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); + + itotal += be32_to_cpu(agi->agi_count); + ifree += be32_to_cpu(agi->agi_freecount); + xfs_buf_relse(agibp); } - agip = XFS_BUF_TO_AGI(agibp); - ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum)); - ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum))); - ASSERT(be32_to_cpu(agip->agi_seqno) == agno); - - itotal += be32_to_cpu(agip->agi_count); - ifree += be32_to_cpu(agip->agi_freecount); - xfs_buf_relse(agibp); } - - sbbp = xfs_getsb(mp, 0); -#ifdef XFS_LOUD_RECOVERY - sbp = &mp->m_sb; - xfs_xlatesb(XFS_BUF_TO_SBP(sbbp), sbp, 1, XFS_SB_ALL_BITS); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", - sbp->sb_icount, itotal); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", - sbp->sb_ifree, ifree); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", - sbp->sb_fdblocks, freeblks); -#if 0 - /* - * This is turned off until I account for the allocation - * btree blocks which live in free space. - */ - ASSERT(sbp->sb_icount == itotal); - ASSERT(sbp->sb_ifree == ifree); - ASSERT(sbp->sb_fdblocks == freeblks); -#endif -#endif - xfs_buf_relse(sbbp); } #endif /* DEBUG */ diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h index b2254555530..1c55ccbb379 100644 --- a/fs/xfs/xfs_log_recover.h +++ b/fs/xfs/xfs_log_recover.h @@ -28,29 +28,28 @@ #define XLOG_RHASH(tid) \ ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) -#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) +#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1) /* * item headers are in ri_buf[0]. Additional buffers follow. */ typedef struct xlog_recover_item { - struct xlog_recover_item *ri_next; - struct xlog_recover_item *ri_prev; - int ri_type; - int ri_cnt; /* count of regions found */ - int ri_total; /* total regions */ - xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ + struct list_head ri_list; + int ri_type; + int ri_cnt; /* count of regions found */ + int ri_total; /* total regions */ + xfs_log_iovec_t *ri_buf; /* ptr to regions buffer */ } xlog_recover_item_t; struct xlog_tid; typedef struct xlog_recover { - struct xlog_recover *r_next; - xlog_tid_t r_log_tid; /* log's transaction id */ - xfs_trans_header_t r_theader; /* trans header for partial */ - int r_state; /* not needed */ - xfs_lsn_t r_lsn; /* xact lsn */ - xlog_recover_item_t *r_itemq; /* q for items */ + struct hlist_node r_list; + xlog_tid_t r_log_tid; /* log's transaction id */ + xfs_trans_header_t r_theader; /* trans header for partial */ + int r_state; /* not needed */ + xfs_lsn_t r_lsn; /* xact lsn */ + struct list_head r_itemq; /* q for items */ } xlog_recover_t; #define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c new file mode 100644 index 00000000000..ee7e0e80246 --- /dev/null +++ b/fs/xfs/xfs_log_rlimit.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2013 Jie Liu. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_trans_space.h" +#include "xfs_inode.h" +#include "xfs_da_btree.h" +#include "xfs_attr_leaf.h" +#include "xfs_bmap_btree.h" + +/* + * Calculate the maximum length in bytes that would be required for a local + * attribute value as large attributes out of line are not logged. + */ +STATIC int +xfs_log_calc_max_attrsetm_res( + struct xfs_mount *mp) +{ + int size; + int nblks; + + size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) - + MAXNAMELEN - 1; + nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); + nblks += XFS_B_TO_FSB(mp, size); + nblks += XFS_NEXTENTADD_SPACE_RES(mp, size, XFS_ATTR_FORK); + + return M_RES(mp)->tr_attrsetm.tr_logres + + M_RES(mp)->tr_attrsetrt.tr_logres * nblks; +} + +/* + * Iterate over the log space reservation table to figure out and return + * the maximum one in terms of the pre-calculated values which were done + * at mount time. + */ +STATIC void +xfs_log_get_max_trans_res( + struct xfs_mount *mp, + struct xfs_trans_res *max_resp) +{ + struct xfs_trans_res *resp; + struct xfs_trans_res *end_resp; + int log_space = 0; + int attr_space; + + attr_space = xfs_log_calc_max_attrsetm_res(mp); + + resp = (struct xfs_trans_res *)M_RES(mp); + end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1); + for (; resp < end_resp; resp++) { + int tmp = resp->tr_logcount > 1 ? + resp->tr_logres * resp->tr_logcount : + resp->tr_logres; + if (log_space < tmp) { + log_space = tmp; + *max_resp = *resp; /* struct copy */ + } + } + + if (attr_space > log_space) { + *max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */ + max_resp->tr_logres = attr_space; + } +} + +/* + * Calculate the minimum valid log size for the given superblock configuration. + * Used to calculate the minimum log size at mkfs time, and to determine if + * the log is large enough or not at mount time. Returns the minimum size in + * filesystem block size units. + */ +int +xfs_log_calc_minimum_size( + struct xfs_mount *mp) +{ + struct xfs_trans_res tres = {0}; + int max_logres; + int min_logblks = 0; + int lsunit = 0; + + xfs_log_get_max_trans_res(mp, &tres); + + max_logres = xfs_log_calc_unit_res(mp, tres.tr_logres); + if (tres.tr_logcount > 1) + max_logres *= tres.tr_logcount; + + if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + lsunit = BTOBB(mp->m_sb.sb_logsunit); + + /* + * Two factors should be taken into account for calculating the minimum + * log space. + * 1) The fundamental limitation is that no single transaction can be + * larger than half size of the log. + * + * From mkfs.xfs, this is considered by the XFS_MIN_LOG_FACTOR + * define, which is set to 3. That means we can definitely fit + * maximally sized 2 transactions in the log. We'll use this same + * value here. + * + * 2) If the lsunit option is specified, a transaction requires 2 LSU + * for the reservation because there are two log writes that can + * require padding - the transaction data and the commit record which + * are written separately and both can require padding to the LSU. + * Consider that we can have an active CIL reservation holding 2*LSU, + * but the CIL is not over a push threshold, in this case, if we + * don't have enough log space for at one new transaction, which + * includes another 2*LSU in the reservation, we will run into dead + * loop situation in log space grant procedure. i.e. + * xlog_grant_head_wait(). + * + * Hence the log size needs to be able to contain two maximally sized + * and padded transactions, which is (2 * (2 * LSU + maxlres)). + * + * Also, the log size should be a multiple of the log stripe unit, round + * it up to lsunit boundary if lsunit is specified. + */ + if (lsunit) { + min_logblks = roundup_64(BTOBB(max_logres), lsunit) + + 2 * lsunit; + } else + min_logblks = BTOBB(max_logres) + 2 * BBSIZE; + min_logblks *= XFS_MIN_LOG_FACTOR; + + return XFS_BB_TO_FSB(mp, min_logblks); +} diff --git a/fs/xfs/xfs_mac.h b/fs/xfs/xfs_mac.h deleted file mode 100644 index 18e0e98e03d..00000000000 --- a/fs/xfs/xfs_mac.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_MAC_H__ -#define __XFS_MAC_H__ - -/* - * Mandatory Access Control - * - * Layout of a composite MAC label: - * ml_list contains the list of categories (MSEN) followed by the list of - * divisions (MINT). This is actually a header for the data structure which - * will have an ml_list with more than one element. - * - * ------------------------------- - * | ml_msen_type | ml_mint_type | - * ------------------------------- - * | ml_level | ml_grade | - * ------------------------------- - * | ml_catcount | - * ------------------------------- - * | ml_divcount | - * ------------------------------- - * | category 1 | - * | . . . | - * | category N | (where N = ml_catcount) - * ------------------------------- - * | division 1 | - * | . . . | - * | division M | (where M = ml_divcount) - * ------------------------------- - */ -#define XFS_MAC_MAX_SETS 250 -typedef struct xfs_mac_label { - __uint8_t ml_msen_type; /* MSEN label type */ - __uint8_t ml_mint_type; /* MINT label type */ - __uint8_t ml_level; /* Hierarchical level */ - __uint8_t ml_grade; /* Hierarchical grade */ - __uint16_t ml_catcount; /* Category count */ - __uint16_t ml_divcount; /* Division count */ - /* Category set, then Division set */ - __uint16_t ml_list[XFS_MAC_MAX_SETS]; -} xfs_mac_label_t; - -/* MSEN label type names. Choose an upper case ASCII character. */ -#define XFS_MSEN_ADMIN_LABEL 'A' /* Admin: low<admin != tcsec<high */ -#define XFS_MSEN_EQUAL_LABEL 'E' /* Wildcard - always equal */ -#define XFS_MSEN_HIGH_LABEL 'H' /* System High - always dominates */ -#define XFS_MSEN_MLD_HIGH_LABEL 'I' /* System High, multi-level dir */ -#define XFS_MSEN_LOW_LABEL 'L' /* System Low - always dominated */ -#define XFS_MSEN_MLD_LABEL 'M' /* TCSEC label on a multi-level dir */ -#define XFS_MSEN_MLD_LOW_LABEL 'N' /* System Low, multi-level dir */ -#define XFS_MSEN_TCSEC_LABEL 'T' /* TCSEC label */ -#define XFS_MSEN_UNKNOWN_LABEL 'U' /* unknown label */ - -/* MINT label type names. Choose a lower case ASCII character. */ -#define XFS_MINT_BIBA_LABEL 'b' /* Dual of a TCSEC label */ -#define XFS_MINT_EQUAL_LABEL 'e' /* Wildcard - always equal */ -#define XFS_MINT_HIGH_LABEL 'h' /* High Grade - always dominates */ -#define XFS_MINT_LOW_LABEL 'l' /* Low Grade - always dominated */ - -/* On-disk XFS extended attribute names */ -#define SGI_MAC_FILE "SGI_MAC_FILE" -#define SGI_MAC_FILE_SIZE (sizeof(SGI_MAC_FILE)-1) - - -#ifdef __KERNEL__ - -#ifdef CONFIG_FS_POSIX_MAC - -/* NOT YET IMPLEMENTED */ - -#define MACEXEC 00100 -#define MACWRITE 00200 -#define MACREAD 00400 - -struct xfs_inode; -extern int xfs_mac_iaccess(struct xfs_inode *, mode_t, cred_t *); - -#define _MAC_XFS_IACCESS(i,m,c) (xfs_mac_iaccess(i,m,c)) -#define _MAC_VACCESS(v,c,m) (xfs_mac_vaccess(v,c,m)) -#define _MAC_EXISTS xfs_mac_vhaslabel - -#else -#define _MAC_XFS_IACCESS(i,m,c) (0) -#define _MAC_VACCESS(v,c,m) (0) -#define _MAC_EXISTS (NULL) -#endif - -#endif /* __KERNEL__ */ - -#endif /* __XFS_MAC_H__ */ diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c new file mode 100644 index 00000000000..63ca2f0420b --- /dev/null +++ b/fs/xfs/xfs_message.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" + +/* + * XFS logging functions + */ +static void +__xfs_printk( + const char *level, + const struct xfs_mount *mp, + struct va_format *vaf) +{ + if (mp && mp->m_fsname) { + printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + return; + } + printk("%sXFS: %pV\n", level, vaf); +} + +#define define_xfs_printk_level(func, kern_level) \ +void func(const struct xfs_mount *mp, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __xfs_printk(kern_level, mp, &vaf); \ + va_end(args); \ +} \ + +define_xfs_printk_level(xfs_emerg, KERN_EMERG); +define_xfs_printk_level(xfs_alert, KERN_ALERT); +define_xfs_printk_level(xfs_crit, KERN_CRIT); +define_xfs_printk_level(xfs_err, KERN_ERR); +define_xfs_printk_level(xfs_warn, KERN_WARNING); +define_xfs_printk_level(xfs_notice, KERN_NOTICE); +define_xfs_printk_level(xfs_info, KERN_INFO); +#ifdef DEBUG +define_xfs_printk_level(xfs_debug, KERN_DEBUG); +#endif + +void +xfs_alert_tag( + const struct xfs_mount *mp, + int panic_tag, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; + + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + xfs_alert(mp, "Transforming an alert into a BUG."); + do_panic = 1; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); + + BUG_ON(do_panic); +} + +void +asswarn(char *expr, char *file, int line) +{ + xfs_warn(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + WARN_ON(1); +} + +void +assfail(char *expr, char *file, int line) +{ + xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +void +xfs_hex_dump(void *p, int length) +{ + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); +} diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h new file mode 100644 index 00000000000..85401155750 --- /dev/null +++ b/fs/xfs/xfs_message.h @@ -0,0 +1,64 @@ +#ifndef __XFS_MESSAGE_H +#define __XFS_MESSAGE_H 1 + +struct xfs_mount; + +extern __printf(2, 3) +void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(3, 4) +void xfs_alert_tag(const struct xfs_mount *mp, int tag, const char *fmt, ...); +extern __printf(2, 3) +void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_err(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...); +extern __printf(2, 3) +void xfs_info(const struct xfs_mount *mp, const char *fmt, ...); + +#ifdef DEBUG +extern __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...); +#else +static inline __printf(2, 3) +void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} +#endif + +#define xfs_printk_ratelimited(func, dev, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + func(dev, fmt, ##__VA_ARGS__); \ +} while (0) + +#define xfs_emerg_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_emerg, dev, fmt, ##__VA_ARGS__) +#define xfs_alert_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_alert, dev, fmt, ##__VA_ARGS__) +#define xfs_crit_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_crit, dev, fmt, ##__VA_ARGS__) +#define xfs_err_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_err, dev, fmt, ##__VA_ARGS__) +#define xfs_warn_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_warn, dev, fmt, ##__VA_ARGS__) +#define xfs_notice_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_notice, dev, fmt, ##__VA_ARGS__) +#define xfs_info_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_info, dev, fmt, ##__VA_ARGS__) +#define xfs_debug_ratelimited(dev, fmt, ...) \ + xfs_printk_ratelimited(xfs_debug, dev, fmt, ##__VA_ARGS__) + +extern void assfail(char *expr, char *f, int l); +extern void asswarn(char *expr, char *f, int l); + +extern void xfs_hex_dump(void *p, int length); + +#endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9dfae18d995..3507cd0ec40 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -17,454 +17,254 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_btree.h" +#include "xfs_dir2.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_rtalloc.h" #include "xfs_bmap.h" +#include "xfs_trans.h" +#include "xfs_trans_priv.h" +#include "xfs_log.h" #include "xfs_error.h" -#include "xfs_rw.h" #include "xfs_quota.h" #include "xfs_fsops.h" - -STATIC void xfs_mount_log_sbunit(xfs_mount_t *, __int64_t); -STATIC int xfs_uuid_mount(xfs_mount_t *); -STATIC void xfs_uuid_unmount(xfs_mount_t *mp); -STATIC void xfs_unmountfs_wait(xfs_mount_t *); +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" #ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int); -STATIC void xfs_icsb_sync_counters(xfs_mount_t *); -STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC int xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); - +STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, + int); +STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else -#define xfs_icsb_destroy_counters(mp) do { } while (0) #define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_sync_counters(mp) do { } while (0) -#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) -#define xfs_icsb_modify_counters_locked(mp, a, b, c) do { } while (0) - +#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #endif -static const struct { - short offset; - short type; /* 0 = integer - * 1 = binary / string (no translation) - */ -} xfs_sb_info[] = { - { offsetof(xfs_sb_t, sb_magicnum), 0 }, - { offsetof(xfs_sb_t, sb_blocksize), 0 }, - { offsetof(xfs_sb_t, sb_dblocks), 0 }, - { offsetof(xfs_sb_t, sb_rblocks), 0 }, - { offsetof(xfs_sb_t, sb_rextents), 0 }, - { offsetof(xfs_sb_t, sb_uuid), 1 }, - { offsetof(xfs_sb_t, sb_logstart), 0 }, - { offsetof(xfs_sb_t, sb_rootino), 0 }, - { offsetof(xfs_sb_t, sb_rbmino), 0 }, - { offsetof(xfs_sb_t, sb_rsumino), 0 }, - { offsetof(xfs_sb_t, sb_rextsize), 0 }, - { offsetof(xfs_sb_t, sb_agblocks), 0 }, - { offsetof(xfs_sb_t, sb_agcount), 0 }, - { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, - { offsetof(xfs_sb_t, sb_logblocks), 0 }, - { offsetof(xfs_sb_t, sb_versionnum), 0 }, - { offsetof(xfs_sb_t, sb_sectsize), 0 }, - { offsetof(xfs_sb_t, sb_inodesize), 0 }, - { offsetof(xfs_sb_t, sb_inopblock), 0 }, - { offsetof(xfs_sb_t, sb_fname[0]), 1 }, - { offsetof(xfs_sb_t, sb_blocklog), 0 }, - { offsetof(xfs_sb_t, sb_sectlog), 0 }, - { offsetof(xfs_sb_t, sb_inodelog), 0 }, - { offsetof(xfs_sb_t, sb_inopblog), 0 }, - { offsetof(xfs_sb_t, sb_agblklog), 0 }, - { offsetof(xfs_sb_t, sb_rextslog), 0 }, - { offsetof(xfs_sb_t, sb_inprogress), 0 }, - { offsetof(xfs_sb_t, sb_imax_pct), 0 }, - { offsetof(xfs_sb_t, sb_icount), 0 }, - { offsetof(xfs_sb_t, sb_ifree), 0 }, - { offsetof(xfs_sb_t, sb_fdblocks), 0 }, - { offsetof(xfs_sb_t, sb_frextents), 0 }, - { offsetof(xfs_sb_t, sb_uquotino), 0 }, - { offsetof(xfs_sb_t, sb_gquotino), 0 }, - { offsetof(xfs_sb_t, sb_qflags), 0 }, - { offsetof(xfs_sb_t, sb_flags), 0 }, - { offsetof(xfs_sb_t, sb_shared_vn), 0 }, - { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, - { offsetof(xfs_sb_t, sb_unit), 0 }, - { offsetof(xfs_sb_t, sb_width), 0 }, - { offsetof(xfs_sb_t, sb_dirblklog), 0 }, - { offsetof(xfs_sb_t, sb_logsectlog), 0 }, - { offsetof(xfs_sb_t, sb_logsectsize),0 }, - { offsetof(xfs_sb_t, sb_logsunit), 0 }, - { offsetof(xfs_sb_t, sb_features2), 0 }, - { sizeof(xfs_sb_t), 0 } -}; +static DEFINE_MUTEX(xfs_uuid_table_mutex); +static int xfs_uuid_table_size; +static uuid_t *xfs_uuid_table; /* - * Return a pointer to an initialized xfs_mount structure. + * See if the UUID is unique among mounted XFS filesystems. + * Mount fails if UUID is nil or a FS with the same UUID is already mounted. */ -xfs_mount_t * -xfs_mount_init(void) +STATIC int +xfs_uuid_mount( + struct xfs_mount *mp) { - xfs_mount_t *mp; + uuid_t *uuid = &mp->m_sb.sb_uuid; + int hole, i; - mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP); + if (mp->m_flags & XFS_MOUNT_NOUUID) + return 0; - if (xfs_icsb_init_counters(mp)) { - mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; + if (uuid_is_nil(uuid)) { + xfs_warn(mp, "Filesystem has nil UUID - can't mount"); + return XFS_ERROR(EINVAL); } - AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail"); - spinlock_init(&mp->m_sb_lock, "xfs_sb"); - mutex_init(&mp->m_ilock); - initnsema(&mp->m_growlock, 1, "xfs_grow"); - /* - * Initialize the AIL. - */ - xfs_trans_ail_init(mp); + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0, hole = -1; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) { + hole = i; + continue; + } + if (uuid_equal(uuid, &xfs_uuid_table[i])) + goto out_duplicate; + } - atomic_set(&mp->m_active_trans, 0); + if (hole < 0) { + xfs_uuid_table = kmem_realloc(xfs_uuid_table, + (xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table), + xfs_uuid_table_size * sizeof(*xfs_uuid_table), + KM_SLEEP); + hole = xfs_uuid_table_size++; + } + xfs_uuid_table[hole] = *uuid; + mutex_unlock(&xfs_uuid_table_mutex); + + return 0; - return mp; + out_duplicate: + mutex_unlock(&xfs_uuid_table_mutex); + xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); + return XFS_ERROR(EINVAL); } -/* - * Free up the resources associated with a mount structure. Assume that - * the structure was initially zeroed, so we can tell which fields got - * initialized. - */ -void -xfs_mount_free( - xfs_mount_t *mp, - int remove_bhv) +STATIC void +xfs_uuid_unmount( + struct xfs_mount *mp) { - if (mp->m_ihash) - xfs_ihash_free(mp); - if (mp->m_chash) - xfs_chash_free(mp); - - if (mp->m_perag) { - int agno; - - for (agno = 0; agno < mp->m_maxagi; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list, - sizeof(xfs_perag_busy_t) * - XFS_PAGB_NUM_SLOTS); - kmem_free(mp->m_perag, - sizeof(xfs_perag_t) * mp->m_sb.sb_agcount); - } + uuid_t *uuid = &mp->m_sb.sb_uuid; + int i; - AIL_LOCK_DESTROY(&mp->m_ail_lock); - spinlock_destroy(&mp->m_sb_lock); - mutex_destroy(&mp->m_ilock); - freesema(&mp->m_growlock); - if (mp->m_quotainfo) - XFS_QM_DONE(mp); - - if (mp->m_fsname != NULL) - kmem_free(mp->m_fsname, mp->m_fsname_len); - if (mp->m_rtname != NULL) - kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1); - if (mp->m_logname != NULL) - kmem_free(mp->m_logname, strlen(mp->m_logname) + 1); - - if (remove_bhv) { - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); - - bhv_remove_all_vfsops(vfsp, 0); - VFS_REMOVEBHV(vfsp, &mp->m_bhv); - } + if (mp->m_flags & XFS_MOUNT_NOUUID) + return; - xfs_icsb_destroy_counters(mp); - kmem_free(mp, sizeof(xfs_mount_t)); + mutex_lock(&xfs_uuid_table_mutex); + for (i = 0; i < xfs_uuid_table_size; i++) { + if (uuid_is_nil(&xfs_uuid_table[i])) + continue; + if (!uuid_equal(uuid, &xfs_uuid_table[i])) + continue; + memset(&xfs_uuid_table[i], 0, sizeof(uuid_t)); + break; + } + ASSERT(i < xfs_uuid_table_size); + mutex_unlock(&xfs_uuid_table_mutex); } -/* - * Check the validity of the SB found. - */ -STATIC int -xfs_mount_validate_sb( - xfs_mount_t *mp, - xfs_sb_t *sbp, - int flags) +STATIC void +__xfs_free_perag( + struct rcu_head *head) { - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ - if (sbp->sb_magicnum != XFS_SB_MAGIC) { - xfs_fs_mount_cmn_err(flags, "bad magic number"); - return XFS_ERROR(EWRONGFS); - } - - if (!XFS_SB_GOOD_VERSION(sbp)) { - xfs_fs_mount_cmn_err(flags, "bad version"); - return XFS_ERROR(EWRONGFS); - } + struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head); - if (unlikely( - sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - xfs_fs_mount_cmn_err(flags, - "filesystem is marked as having an external log; " - "specify logdev on the\nmount command line."); - return XFS_ERROR(EINVAL); - } - - if (unlikely( - sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - xfs_fs_mount_cmn_err(flags, - "filesystem is marked as having an internal log; " - "do not specify logdev on\nthe mount command line."); - return XFS_ERROR(EINVAL); - } + ASSERT(atomic_read(&pag->pag_ref) == 0); + kmem_free(pag); +} - /* - * More sanity checking. These were stolen directly from - * xfs_repair. - */ - if (unlikely( - sbp->sb_agcount <= 0 || - sbp->sb_sectsize < XFS_MIN_SECTORSIZE || - sbp->sb_sectsize > XFS_MAX_SECTORSIZE || - sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || - sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || - sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || - sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || - sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || - sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || - sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || - sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || - sbp->sb_inodelog < XFS_DINODE_MIN_LOG || - sbp->sb_inodelog > XFS_DINODE_MAX_LOG || - (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || - (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || - (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { - xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); - return XFS_ERROR(EFSCORRUPTED); - } +/* + * Free up the per-ag resources associated with the mount structure. + */ +STATIC void +xfs_free_perag( + xfs_mount_t *mp) +{ + xfs_agnumber_t agno; + struct xfs_perag *pag; - /* - * Sanity check AG count, size fields against data size field - */ - if (unlikely( - sbp->sb_dblocks == 0 || - sbp->sb_dblocks > - (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || - sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * - sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { - xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); - return XFS_ERROR(EFSCORRUPTED); + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + ASSERT(atomic_read(&pag->pag_ref) == 0); + call_rcu(&pag->rcu_head, __xfs_free_perag); } +} +/* + * Check size of device based on the (data/realtime) block count. + * Note: this check is used by the growfs code as well as mount. + */ +int +xfs_sb_validate_fsb_count( + xfs_sb_t *sbp, + __uint64_t nblocks) +{ ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); #if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ - if (unlikely( - (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX || - (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) { + if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) + return EFBIG; #else /* Limited by UINT_MAX of sectors */ - if (unlikely( - (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || - (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { + if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) + return EFBIG; #endif - xfs_fs_mount_cmn_err(flags, - "file system too large to be mounted on this system."); - return XFS_ERROR(E2BIG); - } - - if (unlikely(sbp->sb_inprogress)) { - xfs_fs_mount_cmn_err(flags, "file system busy"); - return XFS_ERROR(EFSCORRUPTED); - } - - /* - * Version 1 directory format has never worked on Linux. - */ - if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) { - xfs_fs_mount_cmn_err(flags, - "file system using version 1 directory format"); - return XFS_ERROR(ENOSYS); - } - - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_fs_mount_cmn_err(flags, - "file system with blocksize %d bytes", - sbp->sb_blocksize); - xfs_fs_mount_cmn_err(flags, - "only pagesize (%ld) or less will currently work.", - PAGE_SIZE); - return XFS_ERROR(ENOSYS); - } - return 0; } -xfs_agnumber_t +int xfs_initialize_perag( - bhv_vfs_t *vfs, xfs_mount_t *mp, - xfs_agnumber_t agcount) + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) { - xfs_agnumber_t index, max_metadata; + xfs_agnumber_t index; + xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; xfs_ino_t ino; xfs_sb_t *sbp = &mp->m_sb; - xfs_ino_t max_inum = XFS_MAXINUMBER_32; + int error = -ENOMEM; - /* Check to see if the filesystem can overflow 32 bit inodes */ - agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); - ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - - /* Clear the mount flag if no inode can overflow 32 bits - * on this filesystem, or if specifically requested.. + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. */ - if ((vfs->vfs_flag & VFS_32BITINODES) && ino > max_inum) { - mp->m_flags |= XFS_MOUNT_32BITINODES; - } else { - mp->m_flags &= ~XFS_MOUNT_32BITINODES; - } - - /* If we can overflow then setup the ag headers accordingly */ - if (mp->m_flags & XFS_MOUNT_32BITINODES) { - /* Calculate how much should be reserved for inodes to - * meet the max inode percentage. - */ - if (mp->m_maxicount) { - __uint64_t icount; - - icount = sbp->sb_dblocks * sbp->sb_imax_pct; - do_div(icount, 100); - icount += sbp->sb_agblocks - 1; - do_div(icount, sbp->sb_agblocks); - max_metadata = icount; - } else { - max_metadata = agcount; + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; } - for (index = 0; index < agcount; index++) { - ino = XFS_AGINO_TO_INO(mp, index, agino); - if (ino > max_inum) { - index++; - break; - } - - /* This ag is preferred for inodes */ - pag = &mp->m_perag[index]; - pag->pagi_inodeok = 1; - if (index < max_metadata) - pag->pagf_metadata = 1; - } - } else { - /* Setup default behavior for smaller filesystems */ - for (index = 0; index < agcount; index++) { - pag = &mp->m_perag[index]; - pag->pagi_inodeok = 1; + if (!first_initialised) + first_initialised = index; + + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + goto out_unwind; + pag->pag_agno = index; + pag->pag_mount = mp; + spin_lock_init(&pag->pag_ici_lock); + mutex_init(&pag->pag_ici_reclaim_lock); + INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); + spin_lock_init(&pag->pag_buf_lock); + pag->pag_buf_tree = RB_ROOT; + + if (radix_tree_preload(GFP_NOFS)) + goto out_unwind; + + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); } - return index; -} - -/* - * xfs_xlatesb - * - * data - on disk version of sb - * sb - a superblock - * dir - conversion direction: <0 - convert sb to buf - * >0 - convert buf to sb - * fields - which fields to copy (bitmask) - */ -void -xfs_xlatesb( - void *data, - xfs_sb_t *sb, - int dir, - __int64_t fields) -{ - xfs_caddr_t buf_ptr; - xfs_caddr_t mem_ptr; - xfs_sb_field_t f; - int first; - int size; - - ASSERT(dir); - ASSERT(fields); - if (!fields) - return; - - buf_ptr = (xfs_caddr_t)data; - mem_ptr = (xfs_caddr_t)sb; + /* + * If we mount with the inode64 option, or no inode overflows + * the legacy 32-bit address space clear the inode32 option. + */ + agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); + ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); - while (fields) { - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - first = xfs_sb_info[f].offset; - size = xfs_sb_info[f + 1].offset - first; + if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) + mp->m_flags |= XFS_MOUNT_32BITINODES; + else + mp->m_flags &= ~XFS_MOUNT_32BITINODES; - ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + if (mp->m_flags & XFS_MOUNT_32BITINODES) + index = xfs_set_inode32(mp); + else + index = xfs_set_inode64(mp); - if (size == 1 || xfs_sb_info[f].type == 1) { - if (dir > 0) { - memcpy(mem_ptr + first, buf_ptr + first, size); - } else { - memcpy(buf_ptr + first, mem_ptr + first, size); - } - } else { - switch (size) { - case 2: - INT_XLATE(*(__uint16_t*)(buf_ptr+first), - *(__uint16_t*)(mem_ptr+first), - dir, ARCH_CONVERT); - break; - case 4: - INT_XLATE(*(__uint32_t*)(buf_ptr+first), - *(__uint32_t*)(mem_ptr+first), - dir, ARCH_CONVERT); - break; - case 8: - INT_XLATE(*(__uint64_t*)(buf_ptr+first), - *(__uint64_t*)(mem_ptr+first), dir, ARCH_CONVERT); - break; - default: - ASSERT(0); - } - } + if (maxagi) + *maxagi = index; + return 0; - fields &= ~(1LL << f); +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); } + return error; } /* @@ -473,256 +273,143 @@ xfs_xlatesb( * Does the initial read of the superblock. */ int -xfs_readsb(xfs_mount_t *mp, int flags) +xfs_readsb( + struct xfs_mount *mp, + int flags) { unsigned int sector_size; - unsigned int extra_flags; - xfs_buf_t *bp; - xfs_sb_t *sbp; + struct xfs_buf *bp; + struct xfs_sb *sbp = &mp->m_sb; int error; + int loud = !(flags & XFS_MFSI_QUIET); + const struct xfs_buf_ops *buf_ops; ASSERT(mp->m_sb_bp == NULL); ASSERT(mp->m_ddev_targp != NULL); /* + * For the initial read, we must guess at the sector + * size based on the block device. It's enough to + * get the sb_sectsize out of the superblock and + * then reread with the proper length. + * We don't verify it yet, because it may not be complete. + */ + sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); + buf_ops = NULL; + + /* * Allocate a (locked) buffer to hold the superblock. * This will be kept around at all times to optimize * access to the superblock. */ - sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; - - bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); - if (!bp || XFS_BUF_ISERROR(bp)) { - xfs_fs_mount_cmn_err(flags, "SB read failed"); - error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; - goto fail; +reread: + bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0, buf_ops); + if (!bp) { + if (loud) + xfs_warn(mp, "SB buffer read failed"); + return EIO; + } + if (bp->b_error) { + error = bp->b_error; + if (loud) + xfs_warn(mp, "SB validate failed with error %d.", error); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; + goto release_buf; } - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); /* * Initialize the mount structure from the superblock. - * But first do some basic consistency checking. */ - sbp = XFS_BUF_TO_SBP(bp); - xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS); + xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); + xfs_sb_quota_from_disk(sbp); - error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); - if (error) { - xfs_fs_mount_cmn_err(flags, "SB validate failed"); - goto fail; + /* + * If we haven't validated the superblock, do so now before we try + * to check the sector size and reread the superblock appropriately. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + if (loud) + xfs_warn(mp, "Invalid superblock magic number"); + error = EINVAL; + goto release_buf; } /* * We must be able to do sector-sized and sector-aligned IO. */ - if (sector_size > mp->m_sb.sb_sectsize) { - xfs_fs_mount_cmn_err(flags, - "device supports only %u byte sectors (not %u)", - sector_size, mp->m_sb.sb_sectsize); + if (sector_size > sbp->sb_sectsize) { + if (loud) + xfs_warn(mp, "device supports %u byte sectors (not %u)", + sector_size, sbp->sb_sectsize); error = ENOSYS; - goto fail; + goto release_buf; } - /* - * If device sector size is smaller than the superblock size, - * re-read the superblock so the buffer is correctly sized. - */ - if (sector_size < mp->m_sb.sb_sectsize) { - XFS_BUF_UNMANAGE(bp); + if (buf_ops == NULL) { + /* + * Re-read the superblock so the buffer is correctly sized, + * and properly verified. + */ xfs_buf_relse(bp); - sector_size = mp->m_sb.sb_sectsize; - bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), extra_flags); - if (!bp || XFS_BUF_ISERROR(bp)) { - xfs_fs_mount_cmn_err(flags, "SB re-read failed"); - error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; - goto fail; - } - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + sector_size = sbp->sb_sectsize; + buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops; + goto reread; } - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + /* Initialize per-cpu counters */ + xfs_icsb_reinit_counters(mp); + + /* no need to be quiet anymore, so reset the buf ops */ + bp->b_ops = &xfs_sb_buf_ops; mp->m_sb_bp = bp; - xfs_buf_relse(bp); - ASSERT(XFS_BUF_VALUSEMA(bp) > 0); + xfs_buf_unlock(bp); return 0; - fail: - if (bp) { - XFS_BUF_UNMANAGE(bp); - xfs_buf_relse(bp); - } +release_buf: + xfs_buf_relse(bp); return error; } - /* - * xfs_mount_common - * - * Mount initialization code establishing various mount - * fields from the superblock associated with the given - * mount structure + * Update alignment values based on mount options and sb values */ -STATIC void -xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp) -{ - int i; - - mp->m_agfrotor = mp->m_agirotor = 0; - spinlock_init(&mp->m_agirotor_lock, "m_agirotor_lock"); - mp->m_maxagi = mp->m_sb.sb_agcount; - mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; - mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; - mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; - mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; - mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; - mp->m_litino = sbp->sb_inodesize - - ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t)); - mp->m_blockmask = sbp->sb_blocksize - 1; - mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; - mp->m_blockwmask = mp->m_blockwsize - 1; - INIT_LIST_HEAD(&mp->m_del_inodes); - - /* - * Setup for attributes, in case they get created. - * This value is for inodes getting attributes for the first time, - * the per-inode value is for old attribute values. - */ - ASSERT(sbp->sb_inodesize >= 256 && sbp->sb_inodesize <= 2048); - switch (sbp->sb_inodesize) { - case 256: - mp->m_attroffset = XFS_LITINO(mp) - - XFS_BMDR_SPACE_CALC(MINABTPTRS); - break; - case 512: - case 1024: - case 2048: - mp->m_attroffset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS); - break; - default: - ASSERT(0); - } - ASSERT(mp->m_attroffset < XFS_LITINO(mp)); - - for (i = 0; i < 2; i++) { - mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, - xfs_alloc, i == 0); - mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, - xfs_alloc, i == 0); - } - for (i = 0; i < 2; i++) { - mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, - xfs_bmbt, i == 0); - mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, - xfs_bmbt, i == 0); - } - for (i = 0; i < 2; i++) { - mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize, - xfs_inobt, i == 0); - mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize, - xfs_inobt, i == 0); - } - - mp->m_bsize = XFS_FSB_TO_BB(mp, 1); - mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, - sbp->sb_inopblock); - mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; -} -/* - * xfs_mountfs - * - * This function does the following on an initial mount of a file system: - * - reads the superblock from disk and init the mount struct - * - if we're a 32-bit kernel, do a size check on the superblock - * so we don't mount terabyte filesystems - * - init mount struct realtime fields - * - allocate inode hash table for fs - * - init directory manager - * - perform recovery and init the log manager - */ -int -xfs_mountfs( - bhv_vfs_t *vfsp, - xfs_mount_t *mp, - int mfsi_flags) +STATIC int +xfs_update_alignment(xfs_mount_t *mp) { - xfs_buf_t *bp; xfs_sb_t *sbp = &(mp->m_sb); - xfs_inode_t *rip; - bhv_vnode_t *rvp = NULL; - int readio_log, writeio_log; - xfs_daddr_t d; - __uint64_t ret64; - __int64_t update_flags; - uint quotamount, quotaflags; - int agno; - int uuid_mounted = 0; - int error = 0; - - if (mp->m_sb_bp == NULL) { - if ((error = xfs_readsb(mp, mfsi_flags))) { - return error; - } - } - xfs_mount_common(mp, sbp); - /* - * Check if sb_agblocks is aligned at stripe boundary - * If sb_agblocks is NOT aligned turn off m_dalign since - * allocator alignment is within an ag, therefore ag has - * to be aligned at stripe boundary. - */ - update_flags = 0LL; - if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) { + if (mp->m_dalign) { /* * If stripe unit and stripe width are not multiples * of the fs blocksize turn off alignment. */ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || (BBTOB(mp->m_swidth) & mp->m_blockmask)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - cmn_err(CE_WARN, - "XFS: alignment check 1 failed"); - error = XFS_ERROR(EINVAL); - goto error1; - } - mp->m_dalign = mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. blocksize(%d)", + sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } else { /* * Convert the stripe unit and width to FSBs. */ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign); if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) { - if (mp->m_flags & XFS_MOUNT_RETERR) { - error = XFS_ERROR(EINVAL); - goto error1; - } - xfs_fs_cmn_err(CE_WARN, mp, -"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", - mp->m_dalign, mp->m_swidth, - sbp->sb_agblocks); - - mp->m_dalign = 0; - mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit/swidth vs. agsize(%d)", + sbp->sb_agblocks); + return XFS_ERROR(EINVAL); } else if (mp->m_dalign) { mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); } else { - if (mp->m_flags & XFS_MOUNT_RETERR) { - xfs_fs_cmn_err(CE_WARN, mp, -"stripe alignment turned off: sunit(%d) less than bsize(%d)", - mp->m_dalign, - mp->m_blockmask +1); - error = XFS_ERROR(EINVAL); - goto error1; - } - mp->m_swidth = 0; + xfs_warn(mp, + "alignment check failed: sunit(%d) less than bsize(%d)", + mp->m_dalign, sbp->sb_blocksize); + return XFS_ERROR(EINVAL); } } @@ -730,67 +417,65 @@ xfs_mountfs( * Update superblock with new values * and log changes */ - if (XFS_SB_VERSION_HASDALIGN(sbp)) { + if (xfs_sb_version_hasdalign(sbp)) { if (sbp->sb_unit != mp->m_dalign) { sbp->sb_unit = mp->m_dalign; - update_flags |= XFS_SB_UNIT; + mp->m_update_flags |= XFS_SB_UNIT; } if (sbp->sb_width != mp->m_swidth) { sbp->sb_width = mp->m_swidth; - update_flags |= XFS_SB_WIDTH; + mp->m_update_flags |= XFS_SB_WIDTH; } + } else { + xfs_warn(mp, + "cannot change alignment: superblock does not support data alignment"); + return XFS_ERROR(EINVAL); } } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && - XFS_SB_VERSION_HASDALIGN(&mp->m_sb)) { + xfs_sb_version_hasdalign(&mp->m_sb)) { mp->m_dalign = sbp->sb_unit; mp->m_swidth = sbp->sb_width; } - xfs_alloc_compute_maxlevels(mp); - xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); - xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); - xfs_ialloc_compute_maxlevels(mp); + return 0; +} - if (sbp->sb_imax_pct) { - __uint64_t icount; +/* + * Set the maximum inode count for this filesystem + */ +STATIC void +xfs_set_maxicount(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + __uint64_t icount; - /* Make sure the maximum inode count is a multiple of the - * units we allocate inodes in. + if (sbp->sb_imax_pct) { + /* + * Make sure the maximum inode count is a multiple + * of the units we allocate inodes in. */ - icount = sbp->sb_dblocks * sbp->sb_imax_pct; do_div(icount, 100); do_div(icount, mp->m_ialloc_blks); mp->m_maxicount = (icount * mp->m_ialloc_blks) << sbp->sb_inopblog; - } else + } else { mp->m_maxicount = 0; - - mp->m_maxioffset = xfs_max_file_offset(sbp->sb_blocklog); - - /* - * XFS uses the uuid from the superblock as the unique - * identifier for fsid. We can not use the uuid from the volume - * since a single partition filesystem is identical to a single - * partition volume/filesystem. - */ - if ((mfsi_flags & XFS_MFSI_SECOND) == 0 && - (mp->m_flags & XFS_MOUNT_NOUUID) == 0) { - if (xfs_uuid_mount(mp)) { - error = XFS_ERROR(EINVAL); - goto error1; - } - uuid_mounted=1; - ret64 = uuid_hash64(&sbp->sb_uuid); - memcpy(&vfsp->vfs_fsid, &ret64, sizeof(ret64)); } +} + +/* + * Set the default minimum read and write sizes unless + * already specified in a mount option. + * We use smaller I/O sizes when the file system + * is being used for NFS service (wsync mount option). + */ +STATIC void +xfs_set_rw_sizes(xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + int readio_log, writeio_log; - /* - * Set the default minimum read and write sizes unless - * already specified in a mount option. - * We use smaller I/O sizes when the file system - * is being used for NFS service (wsync mount option). - */ if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { if (mp->m_flags & XFS_MOUNT_WSYNC) { readio_log = XFS_WSYNC_READIO_LOG; @@ -804,16 +489,6 @@ xfs_mountfs( writeio_log = mp->m_writeio_log; } - /* - * Set the number of readahead buffers to use based on - * physical memory size. - */ - if (xfs_physmem <= 4096) /* <= 16MB */ - mp->m_nreadaheads = XFS_RW_NREADAHEAD_16MB; - else if (xfs_physmem <= 8192) /* <= 32MB */ - mp->m_nreadaheads = XFS_RW_NREADAHEAD_32MB; - else - mp->m_nreadaheads = XFS_RW_NREADAHEAD_K32; if (sbp->sb_blocklog > readio_log) { mp->m_readio_log = sbp->sb_blocklog; } else { @@ -826,21 +501,33 @@ xfs_mountfs( mp->m_writeio_log = writeio_log; } mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); +} - /* - * Set the inode cluster size based on the physical memory - * size. This may still be overridden by the file system - * block size if it is larger than the chosen cluster size. - */ - if (xfs_physmem <= btoc(32 * 1024 * 1024)) { /* <= 32 MB */ - mp->m_inode_cluster_size = XFS_INODE_SMALL_CLUSTER_SIZE; - } else { - mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; +/* + * precalculate the low space thresholds for dynamic speculative preallocation. + */ +void +xfs_set_low_space_thresholds( + struct xfs_mount *mp) +{ + int i; + + for (i = 0; i < XFS_LOWSP_MAX; i++) { + __uint64_t space = mp->m_sb.sb_dblocks; + + do_div(space, 100); + mp->m_low_space[i] = space * (i + 1); } - /* - * Set whether we're using inode alignment. - */ - if (XFS_SB_VERSION_HASALIGN(&mp->m_sb) && +} + + +/* + * Set whether we're using inode alignment. + */ +STATIC void +xfs_set_inoalignment(xfs_mount_t *mp) +{ + if (xfs_sb_version_hasalign(&mp->m_sb) && mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) mp->m_inoalign_mask = mp->m_sb.sb_inoalignmt - 1; @@ -855,152 +542,342 @@ xfs_mountfs( mp->m_sinoalign = mp->m_dalign; else mp->m_sinoalign = 0; - /* - * Check that the data (and log if separate) are an ok size. - */ +} + +/* + * Check that the data (and log if separate) is an ok size. + */ +STATIC int +xfs_check_sizes(xfs_mount_t *mp) +{ + xfs_buf_t *bp; + xfs_daddr_t d; + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { - cmn_err(CE_WARN, "XFS: size check 1 failed"); - error = XFS_ERROR(E2BIG); - goto error1; + xfs_warn(mp, "filesystem size mismatch detected"); + return XFS_ERROR(EFBIG); } - error = xfs_read_buf(mp, mp->m_ddev_targp, - d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp); - if (!error) { - xfs_buf_relse(bp); - } else { - cmn_err(CE_WARN, "XFS: size check 2 failed"); - if (error == ENOSPC) { - error = XFS_ERROR(E2BIG); - } - goto error1; + bp = xfs_buf_read_uncached(mp->m_ddev_targp, + d - XFS_FSS_TO_BB(mp, 1), + XFS_FSS_TO_BB(mp, 1), 0, NULL); + if (!bp) { + xfs_warn(mp, "last sector read failed"); + return EIO; } + xfs_buf_relse(bp); - if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) && - mp->m_logdev_targp != mp->m_ddev_targp) { + if (mp->m_logdev_targp != mp->m_ddev_targp) { d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { - cmn_err(CE_WARN, "XFS: size check 3 failed"); - error = XFS_ERROR(E2BIG); - goto error1; + xfs_warn(mp, "log size mismatch detected"); + return XFS_ERROR(EFBIG); } - error = xfs_read_buf(mp, mp->m_logdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp); - if (!error) { - xfs_buf_relse(bp); - } else { - cmn_err(CE_WARN, "XFS: size check 3 failed"); - if (error == ENOSPC) { - error = XFS_ERROR(E2BIG); - } - goto error1; + bp = xfs_buf_read_uncached(mp->m_logdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, NULL); + if (!bp) { + xfs_warn(mp, "log device read failed"); + return EIO; } + xfs_buf_relse(bp); } + return 0; +} + +/* + * Clear the quotaflags in memory and in the superblock. + */ +int +xfs_mount_reset_sbqflags( + struct xfs_mount *mp) +{ + int error; + struct xfs_trans *tp; + + mp->m_qflags = 0; /* - * Initialize realtime fields in the mount structure + * It is OK to look at sb_qflags here in mount path, + * without m_sb_lock. */ - if ((error = xfs_rtmount_init(mp))) { - cmn_err(CE_WARN, "XFS: RT mount failed"); - goto error1; - } + if (mp->m_sb.sb_qflags == 0) + return 0; + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = 0; + spin_unlock(&mp->m_sb_lock); /* - * For client case we are done now + * If the fs is readonly, let the incore superblock run + * with quotas off but don't flush the update out to disk */ - if (mfsi_flags & XFS_MFSI_CLIENT) { + if (mp->m_flags & XFS_MOUNT_RDONLY) return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_alert(mp, "%s: Superblock update failed!", __func__); + return error; } + xfs_mod_sb(tp, XFS_SB_QFLAGS); + return xfs_trans_commit(tp, 0); +} + +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + /* - * Copies the low order bits of the timestamp and the randomly - * set "sequence" number out of a UUID. + * We default to 5% or 8192 fsbs of space reserved, whichever is + * smaller. This is intended to cover concurrent allocation + * transactions when we initially hit enospc. These each require a 4 + * block reservation. Hence by default we cover roughly 2000 concurrent + * allocation reservations. */ - uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 8192); + return resblks; +} + +/* + * This function does the following on an initial mount of a file system: + * - reads the superblock from disk and init the mount struct + * - if we're a 32-bit kernel, do a size check on the superblock + * so we don't mount terabyte filesystems + * - init mount struct realtime fields + * - allocate inode hash table for fs + * - init directory manager + * - perform recovery and init the log manager + */ +int +xfs_mountfs( + xfs_mount_t *mp) +{ + xfs_sb_t *sbp = &(mp->m_sb); + xfs_inode_t *rip; + __uint64_t resblks; + uint quotamount = 0; + uint quotaflags = 0; + int error = 0; + + xfs_sb_mount_common(mp, sbp); /* - * The vfs structure needs to have a file system independent - * way of checking for the invariant file system ID. Since it - * can't look at mount structures it has a pointer to the data - * in the mount structure. + * Check for a mismatched features2 values. Older kernels + * read & wrote into the wrong sb offset for sb_features2 + * on some platforms due to xfs_sb_t not being 64bit size aligned + * when sb_features2 was added, which made older superblock + * reading/writing routines swap it as a 64-bit value. * - * File systems that don't support user level file handles (i.e. - * all of them except for XFS) will leave vfs_altfsid as NULL. + * For backwards compatibility, we make both slots equal. + * + * If we detect a mismatched field, we OR the set bits into the + * existing features2 field in case it has already been modified; we + * don't want to lose any features. We then update the bad location + * with the ORed value so that older kernels will see any features2 + * flags, and mark the two fields as needing updates once the + * transaction subsystem is online. */ - vfsp->vfs_altfsid = (xfs_fsid_t *)mp->m_fixedfsid; - mp->m_dmevmask = 0; /* not persistent; set after each mount */ + if (xfs_sb_has_mismatched_features2(sbp)) { + xfs_warn(mp, "correcting sb_features alignment problem"); + sbp->sb_features2 |= sbp->sb_bad_features2; + sbp->sb_bad_features2 = sbp->sb_features2; + mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; - xfs_dir_mount(mp); + /* + * Re-check for ATTR2 in case it was found in bad_features2 + * slot. + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + } + + if (xfs_sb_version_hasattr2(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_sb_version_removeattr2(&mp->m_sb); + mp->m_update_flags |= XFS_SB_FEATURES2; + + /* update sb_versionnum for the clearing of the morebits */ + if (!sbp->sb_features2) + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } + + /* always use v2 inodes by default now */ + if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { + mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_update_flags |= XFS_SB_VERSIONNUM; + } /* - * Initialize the attribute manager's entries. + * Check if sb_agblocks is aligned at stripe boundary + * If sb_agblocks is NOT aligned turn off m_dalign since + * allocator alignment is within an ag, therefore ag has + * to be aligned at stripe boundary. */ - mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100; + error = xfs_update_alignment(mp); + if (error) + goto out; + + xfs_alloc_compute_maxlevels(mp); + xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); + xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); + xfs_ialloc_compute_maxlevels(mp); + + xfs_set_maxicount(mp); + + error = xfs_uuid_mount(mp); + if (error) + goto out; /* - * Initialize the precomputed transaction reservations values. + * Set the minimum read and write sizes */ - xfs_trans_init(mp); + xfs_set_rw_sizes(mp); + + /* set the low space thresholds for dynamic preallocation */ + xfs_set_low_space_thresholds(mp); + + /* + * Set the inode cluster size. + * This may still be overridden by the file system + * block size if it is larger than the chosen cluster size. + * + * For v5 filesystems, scale the cluster size with the inode size to + * keep a constant ratio of inode per cluster buffer, but only if mkfs + * has set the inode alignment value appropriately for larger cluster + * sizes. + */ + mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int new_size = mp->m_inode_cluster_size; + + new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; + if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) + mp->m_inode_cluster_size = new_size; + } + + /* + * Set inode alignment fields + */ + xfs_set_inoalignment(mp); + + /* + * Check that the data (and log if separate) is an ok size. + */ + error = xfs_check_sizes(mp); + if (error) + goto out_remove_uuid; + + /* + * Initialize realtime fields in the mount structure + */ + error = xfs_rtmount_init(mp); + if (error) { + xfs_warn(mp, "RT mount failed"); + goto out_remove_uuid; + } + + /* + * Copies the low order bits of the timestamp and the randomly + * set "sequence" number out of a UUID. + */ + uuid_getnodeuniq(&sbp->sb_uuid, mp->m_fixedfsid); + + mp->m_dmevmask = 0; /* not persistent; set after each mount */ + + error = xfs_da_mount(mp); + if (error) { + xfs_warn(mp, "Failed dir/attr init: %d", error); + goto out_remove_uuid; + } /* - * Allocate and initialize the inode hash table for this - * file system. + * Initialize the precomputed transaction reservations values. */ - xfs_ihash_init(mp); - xfs_chash_init(mp); + xfs_trans_init(mp); /* * Allocate and initialize the per-ag data. */ - init_rwsem(&mp->m_peraglock); - mp->m_perag = - kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP); + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed per-ag init: %d", error); + goto out_free_dir; + } - mp->m_maxagi = xfs_initialize_perag(vfsp, mp, sbp->sb_agcount); + if (!sbp->sb_logblocks) { + xfs_warn(mp, "no log defined"); + XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); + error = XFS_ERROR(EFSCORRUPTED); + goto out_free_perag; + } /* * log's mount-time initialization. Perform 1st part recovery if needed */ - if (likely(sbp->sb_logblocks > 0)) { /* check for volume case */ - error = xfs_log_mount(mp, mp->m_logdev_targp, - XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), - XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); - if (error) { - cmn_err(CE_WARN, "XFS: log mount failed"); - goto error2; - } - } else { /* No log has been defined */ - cmn_err(CE_WARN, "XFS: no log defined"); - XFS_ERROR_REPORT("xfs_mountfs_int(1)", XFS_ERRLEVEL_LOW, mp); - error = XFS_ERROR(EFSCORRUPTED); - goto error2; + error = xfs_log_mount(mp, mp->m_logdev_targp, + XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), + XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); + if (error) { + xfs_warn(mp, "log mount failed"); + goto out_fail_wait; + } + + /* + * Now the log is mounted, we know if it was an unclean shutdown or + * not. If it was, with the first phase of recovery has completed, we + * have consistent AG blocks on disk. We have not recovered EFIs yet, + * but they are recovered transactionally in the second recovery phase + * later. + * + * Hence we can safely re-initialise incore superblock counters from + * the per-ag data. These may not be correct if the filesystem was not + * cleanly unmounted, so we need to wait for recovery to finish before + * doing this. + * + * If the filesystem was cleanly unmounted, then we can trust the + * values in the superblock to be correct and we don't need to do + * anything here. + * + * If we are currently making the filesystem, the initialisation will + * fail as the perag data is in an undefined state. + */ + if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + !XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + !mp->m_sb.sb_inprogress) { + error = xfs_initialize_perag_data(mp, sbp->sb_agcount); + if (error) + goto out_fail_wait; } /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. */ - error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); if (error) { - cmn_err(CE_WARN, "XFS: failed to read root inode"); - goto error3; + xfs_warn(mp, "failed to read root inode"); + goto out_log_dealloc; } ASSERT(rip != NULL); - rvp = XFS_ITOV(rip); - if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { - cmn_err(CE_WARN, "XFS: corrupted root inode"); - cmn_err(CE_WARN, "Device %s - root %llu is not a directory", - XFS_BUFTARG_NAME(mp->m_ddev_targp), + if (unlikely(!S_ISDIR(rip->i_d.di_mode))) { + xfs_warn(mp, "corrupted root inode %llu: not a directory", (unsigned long long)rip->i_ino); xfs_iunlock(rip, XFS_ILOCK_EXCL); XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, mp); error = XFS_ERROR(EFSCORRUPTED); - goto error4; + goto out_rele_rip; } mp->m_rootip = rip; /* save it */ @@ -1009,253 +886,255 @@ xfs_mountfs( /* * Initialize realtime inode pointers in the mount structure */ - if ((error = xfs_rtmount_inodes(mp))) { + error = xfs_rtmount_inodes(mp); + if (error) { /* * Free up the root inode. */ - cmn_err(CE_WARN, "XFS: failed to read RT inodes"); - goto error4; + xfs_warn(mp, "failed to read RT inodes"); + goto out_rele_rip; } /* - * If fs is not mounted readonly, then update the superblock - * unit and width changes. + * If this is a read-only mount defer the superblock updates until + * the next remount into writeable mode. Otherwise we would never + * perform the update e.g. for the root filesystem. */ - if (update_flags && !(vfsp->vfs_flag & VFS_RDONLY)) - xfs_mount_log_sbunit(mp, update_flags); + if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + goto out_rtunmount; + } + } /* * Initialise the XFS quota management subsystem for this mount */ - if ((error = XFS_QM_INIT(mp, "amount, "aflags))) - goto error4; + if (XFS_IS_QUOTA_RUNNING(mp)) { + error = xfs_qm_newmount(mp, "amount, "aflags); + if (error) + goto out_rtunmount; + } else { + ASSERT(!XFS_IS_QUOTA_ON(mp)); + + /* + * If a file system had quotas running earlier, but decided to + * mount without -o uquota/pquota/gquota options, revoke the + * quotachecked license. + */ + if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { + xfs_notice(mp, "resetting quota flags"); + error = xfs_mount_reset_sbqflags(mp); + if (error) + return error; + } + } /* * Finish recovering the file system. This part needed to be * delayed until after the root and real-time bitmap inodes * were consistently read in. */ - error = xfs_log_mount_finish(mp, mfsi_flags); + error = xfs_log_mount_finish(mp); if (error) { - cmn_err(CE_WARN, "XFS: log mount finish failed"); - goto error4; + xfs_warn(mp, "log mount finish failed"); + goto out_rtunmount; } /* * Complete the quota initialisation, post-log-replay component. */ - if ((error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags))) - goto error4; + if (quotamount) { + ASSERT(mp->m_qflags == 0); + mp->m_qflags = quotaflags; - return 0; + xfs_qm_mount_quotas(mp); + } - error4: /* - * Free up the root inode. + * Now we are mounted, reserve a small amount of unused space for + * privileged transactions. This is needed so that transaction + * space required for critical operations can dip into this pool + * when at ENOSPC. This is needed for operations like create with + * attr, unwritten extent conversion at ENOSPC, etc. Data allocations + * are not allowed to use this reserved space. + * + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. */ - VN_RELE(rvp); - error3: - xfs_log_unmount_dealloc(mp); - error2: - xfs_ihash_free(mp); - xfs_chash_free(mp); - for (agno = 0; agno < sbp->sb_agcount; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list, - sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS); - kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t)); - mp->m_perag = NULL; - /* FALLTHROUGH */ - error1: - if (uuid_mounted) - xfs_uuid_unmount(mp); - xfs_freesb(mp); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, + "Unable to allocate reserve blocks. Continuing without reserve pool."); + } + + return 0; + + out_rtunmount: + xfs_rtunmount_inodes(mp); + out_rele_rip: + IRELE(rip); + out_log_dealloc: + xfs_log_unmount(mp); + out_fail_wait: + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_wait_buftarg(mp->m_logdev_targp); + xfs_wait_buftarg(mp->m_ddev_targp); + out_free_perag: + xfs_free_perag(mp); + out_free_dir: + xfs_da_unmount(mp); + out_remove_uuid: + xfs_uuid_unmount(mp); + out: return error; } /* - * xfs_unmountfs - * * This flushes out the inodes,dquots and the superblock, unmounts the * log and makes sure that incore structures are freed. */ -int -xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) +void +xfs_unmountfs( + struct xfs_mount *mp) { - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); -#if defined(DEBUG) || defined(INDUCE_IO_ERROR) - int64_t fsid; -#endif + __uint64_t resblks; + int error; - xfs_iflush_all(mp); + cancel_delayed_work_sync(&mp->m_eofblocks_work); - XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING); + xfs_qm_unmount_quotas(mp); + xfs_rtunmount_inodes(mp); + IRELE(mp->m_rootip); /* - * Flush out the log synchronously so that we know for sure - * that nothing is pinned. This is important because bflush() - * will skip pinned buffers. + * We can potentially deadlock here if we have an inode cluster + * that has been freed has its buffer still pinned in memory because + * the transaction is still sitting in a iclog. The stale inodes + * on that buffer will have their flush locks held until the + * transaction hits the disk and the callbacks run. the inode + * flush takes the flush lock unconditionally and with nothing to + * push out the iclog we will never get that unlocked. hence we + * need to force the log first. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); - xfs_binval(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - xfs_binval(mp->m_rtdev_targp); - } - - xfs_unmountfs_writesb(mp); - - xfs_unmountfs_wait(mp); /* wait for async bufs */ - - xfs_log_unmount(mp); /* Done! No more fs ops. */ - - xfs_freesb(mp); + /* + * Flush all pending changes from the AIL. + */ + xfs_ail_push_all_sync(mp->m_ail); /* - * All inodes from this mount point should be freed. + * And reclaim all inodes. At this point there should be no dirty + * inodes and none should be pinned or locked, but use synchronous + * reclaim just to be sure. We can stop background inode reclaim + * here as well if it is still running. */ - ASSERT(mp->m_inodes == NULL); + cancel_delayed_work_sync(&mp->m_reclaim_work); + xfs_reclaim_inodes(mp, SYNC_WAIT); - xfs_unmountfs_close(mp, cr); - if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) - xfs_uuid_unmount(mp); + xfs_qm_unmount(mp); -#if defined(DEBUG) || defined(INDUCE_IO_ERROR) /* - * clear all error tags on this filesystem + * Unreserve any blocks we have so that when we unmount we don't account + * the reserved free space as used. This is really only necessary for + * lazy superblock counting because it trusts the incore superblock + * counters to be absolutely correct on clean unmount. + * + * We don't bother correcting this elsewhere for lazy superblock + * counting because on mount of an unclean filesystem we reconstruct the + * correct counter value and this is irrelevant. + * + * For non-lazy counter filesystems, this doesn't matter at all because + * we only every apply deltas to the superblock and hence the incore + * value does not matter.... */ - memcpy(&fsid, &vfsp->vfs_fsid, sizeof(int64_t)); - xfs_errortag_clearall_umount(fsid, mp->m_fsname, 0); + resblks = 0; + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + xfs_warn(mp, "Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); + + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "Unable to update superblock counters. " + "Freespace may not be correct on next mount."); + + xfs_log_unmount(mp); + xfs_da_unmount(mp); + xfs_uuid_unmount(mp); + +#if defined(DEBUG) + xfs_errortag_clearall(mp, 0); #endif - XFS_IODONE(vfsp); - xfs_mount_free(mp, 1); - return 0; -} - -void -xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_free_buftarg(mp->m_logdev_targp, 1); - if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp, 1); - xfs_free_buftarg(mp->m_ddev_targp, 0); -} - -STATIC void -xfs_unmountfs_wait(xfs_mount_t *mp) -{ - if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_wait_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) - xfs_wait_buftarg(mp->m_rtdev_targp); - xfs_wait_buftarg(mp->m_ddev_targp); + xfs_free_perag(mp); } int -xfs_unmountfs_writesb(xfs_mount_t *mp) +xfs_fs_writable(xfs_mount_t *mp) { - xfs_buf_t *sbp; - xfs_sb_t *sb; - int error = 0; - - /* - * skip superblock write if fs is read-only, or - * if we are doing a forced umount. - */ - sbp = xfs_getsb(mp, 0); - if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY || - XFS_FORCED_SHUTDOWN(mp))) { - - xfs_icsb_sync_counters(mp); - - /* - * mark shared-readonly if desired - */ - sb = XFS_BUF_TO_SBP(sbp); - if (mp->m_mk_sharedro) { - if (!(sb->sb_flags & XFS_SBF_READONLY)) - sb->sb_flags |= XFS_SBF_READONLY; - if (!XFS_SB_VERSION_HASSHARED(sb)) - XFS_SB_VERSION_ADDSHARED(sb); - xfs_fs_cmn_err(CE_NOTE, mp, - "Unmounting, marking shared read-only"); - } - XFS_BUF_UNDONE(sbp); - XFS_BUF_UNREAD(sbp); - XFS_BUF_UNDELAYWRITE(sbp); - XFS_BUF_WRITE(sbp); - XFS_BUF_UNASYNC(sbp); - ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); - xfsbdstrat(mp, sbp); - /* Nevermind errors we might get here. */ - error = xfs_iowait(sbp); - if (error) - xfs_ioerror_alert("xfs_unmountfs_writesb", - mp, sbp, XFS_BUF_ADDR(sbp)); - if (error && mp->m_mk_sharedro) - xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly"); - } - xfs_buf_relse(sbp); - return error; + return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) || + (mp->m_flags & XFS_MOUNT_RDONLY)); } /* - * xfs_mod_sb() can be used to copy arbitrary changes to the - * in-core superblock into the superblock buffer to be logged. - * It does not provide the higher level of locking that is - * needed to protect the in-core superblock from concurrent - * access. + * xfs_log_sbcount + * + * Sync the superblock counters to disk. + * + * Note this code can be called during the process of freezing, so + * we may need to use the transaction allocator which does not + * block when the transaction subsystem is in its frozen state. */ -void -xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +int +xfs_log_sbcount(xfs_mount_t *mp) { - xfs_buf_t *bp; - int first; - int last; - xfs_mount_t *mp; - xfs_sb_t *sbp; - xfs_sb_field_t f; - - ASSERT(fields); - if (!fields) - return; - mp = tp->t_mountp; - bp = xfs_trans_getsb(tp, mp, 0); - sbp = XFS_BUF_TO_SBP(bp); - first = sizeof(xfs_sb_t); - last = 0; - - /* translate/copy */ + xfs_trans_t *tp; + int error; - xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), -1, fields); + if (!xfs_fs_writable(mp)) + return 0; - /* find modified range */ + xfs_icsb_sync_counters(mp, 0); - f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - first = xfs_sb_info[f].offset; + /* + * we don't need to do this if we are updating the superblock + * counters on every modification. + */ + if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + return 0; - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; + tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } - xfs_trans_log_buf(tp, bp, first, last); + xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return error; } - /* - * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply + * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply * a delta to a specified field in the in-core superblock. Simply * switch on the field indicated and apply the delta to that field. * Fields are not allowed to dip below zero, so if the delta would * do this do not apply it and return EINVAL. * - * The SB_LOCK must be held when this routine is called. + * The m_sb_lock must be held when this routine is called. */ -int -xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, - int delta, int rsvd) +STATIC int +xfs_mod_incore_sb_unlocked( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { int scounter; /* short counter for 32 bit fields */ long long lcounter; /* long counter for 64 bit fields */ @@ -1287,7 +1166,6 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, mp->m_sb.sb_ifree = lcounter; return 0; case XFS_SBS_FDBLOCKS: - lcounter = (long long) mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); @@ -1301,26 +1179,30 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, lcounter += rem; } } else { /* Taking blocks away */ - lcounter += delta; + if (lcounter >= 0) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } - /* - * If were out of blocks, use any available reserved blocks if - * were allowed to. - */ + /* + * We are out of blocks, use any available reserved + * blocks if were allowed to. + */ + if (!rsvd) + return XFS_ERROR(ENOSPC); - if (lcounter < 0) { - if (rsvd) { - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter < 0) { - return XFS_ERROR(ENOSPC); - } - mp->m_resblks_avail = lcounter; - return 0; - } else { /* not reserved */ - return XFS_ERROR(ENOSPC); - } + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); } mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); @@ -1414,135 +1296,77 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field, /* * xfs_mod_incore_sb() is used to change a field in the in-core * superblock structure by the specified delta. This modification - * is protected by the SB_LOCK. Just use the xfs_mod_incore_sb_unlocked() + * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() * routine to do the work. */ int -xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd) +xfs_mod_incore_sb( + struct xfs_mount *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) { - unsigned long s; - int status; + int status; - /* check for per-cpu counters */ - switch (field) { #ifdef HAVE_PERCPU_SB - case XFS_SBS_ICOUNT: - case XFS_SBS_IFREE: - case XFS_SBS_FDBLOCKS: - if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = xfs_icsb_modify_counters(mp, field, - delta, rsvd); - break; - } - /* FALLTHROUGH */ + ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); #endif - default: - s = XFS_SB_LOCK(mp); - status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - XFS_SB_UNLOCK(mp, s); - break; - } + spin_lock(&mp->m_sb_lock); + status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); return status; } /* - * xfs_mod_incore_sb_batch() is used to change more than one field - * in the in-core superblock structure at a time. This modification - * is protected by a lock internal to this module. The fields and - * changes to those fields are specified in the array of xfs_mod_sb - * structures passed in. + * Change more than one field in the in-core superblock structure at a time. + * + * The fields and changes to those fields are specified in the array of + * xfs_mod_sb structures passed in. Either all of the specified deltas + * will be applied or none of them will. If any modified field dips below 0, + * then all modifications will be backed out and EINVAL will be returned. * - * Either all of the specified deltas will be applied or none of - * them will. If any modified field dips below 0, then all modifications - * will be backed out and EINVAL will be returned. + * Note that this function may not be used for the superblock values that + * are tracked with the in-memory per-cpu counters - a direct call to + * xfs_icsb_modify_counters is required for these. */ int -xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) +xfs_mod_incore_sb_batch( + struct xfs_mount *mp, + xfs_mod_sb_t *msb, + uint nmsb, + int rsvd) { - unsigned long s; - int status=0; - xfs_mod_sb_t *msbp; + xfs_mod_sb_t *msbp; + int error = 0; /* - * Loop through the array of mod structures and apply each - * individually. If any fail, then back out all those - * which have already been applied. Do all of this within - * the scope of the SB_LOCK so that all of the changes will - * be atomic. + * Loop through the array of mod structures and apply each individually. + * If any fail, then back out all those which have already been applied. + * Do all of this within the scope of the m_sb_lock so that all of the + * changes will be atomic. */ - s = XFS_SB_LOCK(mp); - msbp = &msb[0]; - for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { - /* - * Apply the delta at index n. If it fails, break - * from the loop so we'll fall into the undo loop - * below. - */ - switch (msbp->msb_field) { -#ifdef HAVE_PERCPU_SB - case XFS_SBS_ICOUNT: - case XFS_SBS_IFREE: - case XFS_SBS_FDBLOCKS: - if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = xfs_icsb_modify_counters_locked(mp, - msbp->msb_field, - msbp->msb_delta, rsvd); - break; - } - /* FALLTHROUGH */ -#endif - default: - status = xfs_mod_incore_sb_unlocked(mp, - msbp->msb_field, - msbp->msb_delta, rsvd); - break; - } + spin_lock(&mp->m_sb_lock); + for (msbp = msb; msbp < (msb + nmsb); msbp++) { + ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || + msbp->msb_field > XFS_SBS_FDBLOCKS); - if (status != 0) { - break; - } + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + msbp->msb_delta, rsvd); + if (error) + goto unwind; } + spin_unlock(&mp->m_sb_lock); + return 0; - /* - * If we didn't complete the loop above, then back out - * any changes made to the superblock. If you add code - * between the loop above and here, make sure that you - * preserve the value of status. Loop back until - * we step below the beginning of the array. Make sure - * we don't touch anything back there. - */ - if (status != 0) { - msbp--; - while (msbp >= msb) { - switch (msbp->msb_field) { -#ifdef HAVE_PERCPU_SB - case XFS_SBS_ICOUNT: - case XFS_SBS_IFREE: - case XFS_SBS_FDBLOCKS: - if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) { - status = - xfs_icsb_modify_counters_locked(mp, - msbp->msb_field, - -(msbp->msb_delta), - rsvd); - break; - } - /* FALLTHROUGH */ -#endif - default: - status = xfs_mod_incore_sb_unlocked(mp, - msbp->msb_field, - -(msbp->msb_delta), - rsvd); - break; - } - ASSERT(status == 0); - msbp--; - } +unwind: + while (--msbp >= msb) { + error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, + -msbp->msb_delta, rsvd); + ASSERT(error == 0); } - XFS_SB_UNLOCK(mp, s); - return status; + spin_unlock(&mp->m_sb_lock); + return error; } /* @@ -1554,23 +1378,20 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd) * the superblock buffer if it can be locked without sleeping. * If it can't then we'll return NULL. */ -xfs_buf_t * +struct xfs_buf * xfs_getsb( - xfs_mount_t *mp, - int flags) + struct xfs_mount *mp, + int flags) { - xfs_buf_t *bp; + struct xfs_buf *bp = mp->m_sb_bp; - ASSERT(mp->m_sb_bp != NULL); - bp = mp->m_sb_bp; - if (flags & XFS_BUF_TRYLOCK) { - if (!XFS_BUF_CPSEMA(bp)) { + if (!xfs_buf_trylock(bp)) { + if (flags & XBF_TRYLOCK) return NULL; - } - } else { - XFS_BUF_PSEMA(bp, PRIBIO); + xfs_buf_lock(bp); } - XFS_BUF_HOLD(bp); + + xfs_buf_hold(bp); ASSERT(XFS_BUF_ISDONE(bp)); return bp; } @@ -1580,76 +1401,61 @@ xfs_getsb( */ void xfs_freesb( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_buf_t *bp; + struct xfs_buf *bp = mp->m_sb_bp; - /* - * Use xfs_getsb() so that the buffer will be locked - * when we call xfs_buf_relse(). - */ - bp = xfs_getsb(mp, 0); - XFS_BUF_UNMANAGE(bp); - xfs_buf_relse(bp); + xfs_buf_lock(bp); mp->m_sb_bp = NULL; -} - -/* - * See if the UUID is unique among mounted XFS filesystems. - * Mount fails if UUID is nil or a FS with the same UUID is already mounted. - */ -STATIC int -xfs_uuid_mount( - xfs_mount_t *mp) -{ - if (uuid_is_nil(&mp->m_sb.sb_uuid)) { - cmn_err(CE_WARN, - "XFS: Filesystem %s has nil UUID - can't mount", - mp->m_fsname); - return -1; - } - if (!uuid_table_insert(&mp->m_sb.sb_uuid)) { - cmn_err(CE_WARN, - "XFS: Filesystem %s has duplicate UUID - can't mount", - mp->m_fsname); - return -1; - } - return 0; -} - -/* - * Remove filesystem from the UUID table. - */ -STATIC void -xfs_uuid_unmount( - xfs_mount_t *mp) -{ - uuid_table_remove(&mp->m_sb.sb_uuid); + xfs_buf_relse(bp); } /* * Used to log changes to the superblock unit and width fields which could - * be altered by the mount options. Only the first superblock is updated. + * be altered by the mount options, as well as any potential sb_features2 + * fixup. Only the first superblock is updated. */ -STATIC void -xfs_mount_log_sbunit( +int +xfs_mount_log_sb( xfs_mount_t *mp, __int64_t fields) { xfs_trans_t *tp; + int error; - ASSERT(fields & (XFS_SB_UNIT|XFS_SB_WIDTH|XFS_SB_UUID)); + ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | + XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 | + XFS_SB_VERSIONNUM)); tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); - if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT)) { + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); + if (error) { xfs_trans_cancel(tp, 0); - return; + return error; } xfs_mod_sb(tp, fields); - xfs_trans_commit(tp, 0, NULL); + error = xfs_trans_commit(tp, 0); + return error; } +/* + * If the underlying (data/log/rt) device is readonly, there are some + * operations that cannot proceed. + */ +int +xfs_dev_is_read_only( + struct xfs_mount *mp, + char *message) +{ + if (xfs_readonly_buftarg(mp->m_ddev_targp) || + xfs_readonly_buftarg(mp->m_logdev_targp) || + (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { + xfs_notice(mp, "%s required on read-only device.", message); + xfs_notice(mp, "write access unavailable, cannot proceed."); + return EROFS; + } + return 0; +} #ifdef HAVE_PERCPU_SB /* @@ -1674,7 +1480,7 @@ xfs_mount_log_sbunit( * we disable the per-cpu counter and go through the slow path. * * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain it's resources back to + * when we disable a per-cpu counter, we need to drain its resources back to * the global superblock. We do this after disabling the counter to prevent * more threads from queueing up on the counter. * @@ -1689,12 +1495,12 @@ xfs_mount_log_sbunit( * * Locking rules: * - * 1. XFS_SB_LOCK() before picking up per-cpu locks + * 1. m_sb_lock before picking up per-cpu locks * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires XFS_SB_LOCK + per cpu locks + * 3. accurate counter sync requires m_sb_lock + per cpu locks * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding XFS_SB_LOCK - * 6. enabling or disabling a counter requires holding the XFS_SB_LOCK + * 5. modifying global counters requires holding m_sb_lock + * 6. enabling or disabling a counter requires holding the m_sb_lock * and _none_ of the per-cpu locks. * * Disabled counters are only ever re-enabled by a balance operation @@ -1721,27 +1527,32 @@ xfs_icsb_cpu_notify( { xfs_icsb_cnts_t *cntp; xfs_mount_t *mp; - int s; mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); cntp = (xfs_icsb_cnts_t *) per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: /* Easy Case - initialize the area and locks, and * then rebalance when online does everything else for us. */ memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + xfs_icsb_lock(mp); xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); break; case CPU_DEAD: + case CPU_DEAD_FROZEN: /* Disable all the counters, then fold the dead cpu's * count into the total on the global superblock and * re-enable the counters. */ - s = XFS_SB_LOCK(mp); + xfs_icsb_lock(mp); + spin_lock(&mp->m_sb_lock); xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); @@ -1752,10 +1563,11 @@ xfs_icsb_cpu_notify( memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, XFS_ICSB_SB_LOCKED); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, XFS_ICSB_SB_LOCKED); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, XFS_ICSB_SB_LOCKED); - XFS_SB_UNLOCK(mp, s); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); + spin_unlock(&mp->m_sb_lock); + xfs_icsb_unlock(mp); break; } @@ -1774,25 +1586,45 @@ xfs_icsb_init_counters( if (mp->m_sb_cnts == NULL) return -ENOMEM; + for_each_online_cpu(i) { + cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); + memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); + } + + mutex_init(&mp->m_icsb_mutex); + + /* + * start with all counters disabled so that the + * initial balance kicks us off correctly + */ + mp->m_icsb_counters = -1; + #ifdef CONFIG_HOTPLUG_CPU mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; mp->m_icsb_notifier.priority = 0; register_hotcpu_notifier(&mp->m_icsb_notifier); #endif /* CONFIG_HOTPLUG_CPU */ - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - } + return 0; +} + +void +xfs_icsb_reinit_counters( + xfs_mount_t *mp) +{ + xfs_icsb_lock(mp); /* * start with all counters disabled so that the * initial balance kicks us off correctly */ mp->m_icsb_counters = -1; - return 0; + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); + xfs_icsb_unlock(mp); } -STATIC void +void xfs_icsb_destroy_counters( xfs_mount_t *mp) { @@ -1800,9 +1632,10 @@ xfs_icsb_destroy_counters( unregister_hotcpu_notifier(&mp->m_icsb_notifier); free_percpu(mp->m_sb_cnts); } + mutex_destroy(&mp->m_icsb_mutex); } -STATIC inline void +STATIC void xfs_icsb_lock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -1811,7 +1644,7 @@ xfs_icsb_lock_cntr( } } -STATIC inline void +STATIC void xfs_icsb_unlock_cntr( xfs_icsb_cnts_t *icsbp) { @@ -1819,7 +1652,7 @@ xfs_icsb_unlock_cntr( } -STATIC inline void +STATIC void xfs_icsb_lock_all_counters( xfs_mount_t *mp) { @@ -1832,7 +1665,7 @@ xfs_icsb_lock_all_counters( } } -STATIC inline void +STATIC void xfs_icsb_unlock_all_counters( xfs_mount_t *mp) { @@ -1879,7 +1712,7 @@ xfs_icsb_counter_disabled( return test_bit(field, &mp->m_icsb_counters); } -STATIC int +STATIC void xfs_icsb_disable_counter( xfs_mount_t *mp, xfs_sb_field_t field) @@ -1888,11 +1721,22 @@ xfs_icsb_disable_counter( ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); + /* + * If we are already disabled, then there is nothing to do + * here. We check before locking all the counters to avoid + * the expensive lock operation when being called in the + * slow path and the counter is already disabled. This is + * safe because the only time we set or clear this state is under + * the m_icsb_mutex. + */ + if (xfs_icsb_counter_disabled(mp, field)) + return; + xfs_icsb_lock_all_counters(mp); if (!test_and_set_bit(field, &mp->m_icsb_counters)) { /* drain back to superblock */ - xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT); + xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); switch(field) { case XFS_SBS_ICOUNT: mp->m_sb.sb_icount = cnt.icsb_icount; @@ -1909,8 +1753,6 @@ xfs_icsb_disable_counter( } xfs_icsb_unlock_all_counters(mp); - - return 0; } STATIC void @@ -1948,76 +1790,64 @@ xfs_icsb_enable_counter( xfs_icsb_unlock_all_counters(mp); } -STATIC void -xfs_icsb_sync_counters_int( +void +xfs_icsb_sync_counters_locked( xfs_mount_t *mp, int flags) { xfs_icsb_cnts_t cnt; - int s; - - /* Pass 1: lock all counters */ - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - s = XFS_SB_LOCK(mp); xfs_icsb_count(mp, &cnt, flags); - /* Step 3: update mp->m_sb fields */ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) mp->m_sb.sb_icount = cnt.icsb_icount; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) mp->m_sb.sb_ifree = cnt.icsb_ifree; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - XFS_SB_UNLOCK(mp, s); } /* * Accurate update of per-cpu counters to incore superblock */ -STATIC void -xfs_icsb_sync_counters( - xfs_mount_t *mp) -{ - xfs_icsb_sync_counters_int(mp, 0); -} - -/* - * lazy addition used for things like df, background sb syncs, etc - */ void -xfs_icsb_sync_counters_lazy( - xfs_mount_t *mp) +xfs_icsb_sync_counters( + xfs_mount_t *mp, + int flags) { - xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT); + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, flags); + spin_unlock(&mp->m_sb_lock); } /* * Balance and enable/disable counters as necessary. * - * Thresholds for re-enabling counters are somewhat magic. - * inode counts are chosen to be the same number as single - * on disk allocation chunk per CPU, and free blocks is - * something far enough zero that we aren't going thrash - * when we get near ENOSPC. + * Thresholds for re-enabling counters are somewhat magic. inode counts are + * chosen to be the same number as single on disk allocation chunk per CPU, and + * free blocks is something far enough zero that we aren't going thrash when we + * get near ENOSPC. We also need to supply a minimum we require per cpu to + * prevent looping endlessly when xfs_alloc_space asks for more than will + * be distributed to a single CPU but each CPU has enough blocks to be + * reenabled. + * + * Note that we can be called when counters are already disabled. + * xfs_icsb_disable_counter() optimises the counter locking in this case to + * prevent locking every per-cpu counter needlessly. */ -#define XFS_ICSB_INO_CNTR_REENABLE 64 + +#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (512 + XFS_ALLOC_SET_ASIDE(mp)) + (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) STATIC void -xfs_icsb_balance_counter( +xfs_icsb_balance_counter_locked( xfs_mount_t *mp, xfs_sb_field_t field, - int flags) + int min_per_cpu) { uint64_t count, resid; int weight = num_online_cpus(); - int s; - - if (!(flags & XFS_ICSB_SB_LOCKED)) - s = XFS_SB_LOCK(mp); + uint64_t min = (uint64_t)min_per_cpu; /* disable counter and sync counter */ xfs_icsb_disable_counter(mp, field); @@ -2027,20 +1857,20 @@ xfs_icsb_balance_counter( case XFS_SBS_ICOUNT: count = mp->m_sb.sb_icount; resid = do_div(count, weight); - if (count < XFS_ICSB_INO_CNTR_REENABLE) - goto out; + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; break; case XFS_SBS_IFREE: count = mp->m_sb.sb_ifree; resid = do_div(count, weight); - if (count < XFS_ICSB_INO_CNTR_REENABLE) - goto out; + if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) + return; break; case XFS_SBS_FDBLOCKS: count = mp->m_sb.sb_fdblocks; resid = do_div(count, weight); - if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp)) - goto out; + if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) + return; break; default: BUG(); @@ -2049,37 +1879,52 @@ xfs_icsb_balance_counter( } xfs_icsb_enable_counter(mp, field, count, resid); -out: - if (!(flags & XFS_ICSB_SB_LOCKED)) - XFS_SB_UNLOCK(mp, s); } -STATIC int -xfs_icsb_modify_counters_int( +STATIC void +xfs_icsb_balance_counter( + xfs_mount_t *mp, + xfs_sb_field_t fields, + int min_per_cpu) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); + spin_unlock(&mp->m_sb_lock); +} + +int +xfs_icsb_modify_counters( xfs_mount_t *mp, xfs_sb_field_t field, - int delta, - int rsvd, - int flags) + int64_t delta, + int rsvd) { xfs_icsb_cnts_t *icsbp; long long lcounter; /* long counter for 64 bit fields */ - int cpu, s, locked = 0; - int ret = 0, balance_done = 0; + int ret = 0; + might_sleep(); again: - cpu = get_cpu(); - icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu), - xfs_icsb_lock_cntr(icsbp); + preempt_disable(); + icsbp = this_cpu_ptr(mp->m_sb_cnts); + + /* + * if the counter is disabled, go to slow path + */ if (unlikely(xfs_icsb_counter_disabled(mp, field))) goto slow_path; + xfs_icsb_lock_cntr(icsbp); + if (unlikely(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock_cntr(icsbp); + goto slow_path; + } switch (field) { case XFS_SBS_ICOUNT: lcounter = icsbp->icsb_icount; lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_icount = lcounter; break; @@ -2087,7 +1932,7 @@ again: lcounter = icsbp->icsb_ifree; lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_ifree = lcounter; break; @@ -2097,7 +1942,7 @@ again: lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); lcounter += delta; if (unlikely(lcounter < 0)) - goto slow_path; + goto balance_counter; icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); break; default: @@ -2105,73 +1950,79 @@ again: break; } xfs_icsb_unlock_cntr(icsbp); - put_cpu(); - if (locked) - XFS_SB_UNLOCK(mp, s); + preempt_enable(); return 0; +slow_path: + preempt_enable(); + /* - * The slow path needs to be run with the SBLOCK - * held so that we prevent other threads from - * attempting to run this path at the same time. - * this provides exclusion for the balancing code, - * and exclusive fallback if the balance does not - * provide enough resources to continue in an unlocked - * manner. + * serialise with a mutex so we don't burn lots of cpu on + * the superblock lock. We still need to hold the superblock + * lock, however, when we modify the global structures. */ -slow_path: - xfs_icsb_unlock_cntr(icsbp); - put_cpu(); - - /* need to hold superblock incase we need - * to disable a counter */ - if (!(flags & XFS_ICSB_SB_LOCKED)) { - s = XFS_SB_LOCK(mp); - locked = 1; - flags |= XFS_ICSB_SB_LOCKED; - } - if (!balance_done) { - xfs_icsb_balance_counter(mp, field, flags); - balance_done = 1; + xfs_icsb_lock(mp); + + /* + * Now running atomically. + * + * If the counter is enabled, someone has beaten us to rebalancing. + * Drop the lock and try again in the fast path.... + */ + if (!(xfs_icsb_counter_disabled(mp, field))) { + xfs_icsb_unlock(mp); goto again; - } else { - /* - * we might not have enough on this local - * cpu to allocate for a bulk request. - * We need to drain this field from all CPUs - * and disable the counter fastpath - */ - xfs_icsb_disable_counter(mp, field); } + /* + * The counter is currently disabled. Because we are + * running atomically here, we know a rebalance cannot + * be in progress. Hence we can go straight to operating + * on the global superblock. We do not call xfs_mod_incore_sb() + * here even though we need to get the m_sb_lock. Doing so + * will cause us to re-enter this function and deadlock. + * Hence we get the m_sb_lock ourselves and then call + * xfs_mod_incore_sb_unlocked() as the unlocked path operates + * directly on the global counters. + */ + spin_lock(&mp->m_sb_lock); ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); + spin_unlock(&mp->m_sb_lock); - if (locked) - XFS_SB_UNLOCK(mp, s); + /* + * Now that we've modified the global superblock, we + * may be able to re-enable the distributed counters + * (e.g. lots of space just got freed). After that + * we are done. + */ + if (ret != ENOSPC) + xfs_icsb_balance_counter(mp, field, 0); + xfs_icsb_unlock(mp); return ret; -} -STATIC int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int delta, - int rsvd) -{ - return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0); -} +balance_counter: + xfs_icsb_unlock_cntr(icsbp); + preempt_enable(); -/* - * Called when superblock is already locked - */ -STATIC int -xfs_icsb_modify_counters_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int delta, - int rsvd) -{ - return xfs_icsb_modify_counters_int(mp, field, delta, - rsvd, XFS_ICSB_SB_LOCKED); + /* + * We may have multiple threads here if multiple per-cpu + * counters run dry at the same time. This will mean we can + * do more balances than strictly necessary but it is not + * the common slowpath case. + */ + xfs_icsb_lock(mp); + + /* + * running atomically. + * + * This will leave the counter in the correct state for future + * accesses. After the rebalance, we simply try again and our retry + * will either succeed through the fast path or slow path without + * another balance operation being required. + */ + xfs_icsb_balance_counter(mp, field, delta); + xfs_icsb_unlock(mp); + goto again; } + #endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b2bd4be4200..7295a0b7c34 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,271 +18,16 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -typedef struct xfs_trans_reservations { - uint tr_write; /* extent alloc trans */ - uint tr_itruncate; /* truncate trans */ - uint tr_rename; /* rename trans */ - uint tr_link; /* link trans */ - uint tr_remove; /* unlink trans */ - uint tr_symlink; /* symlink trans */ - uint tr_create; /* create trans */ - uint tr_mkdir; /* mkdir trans */ - uint tr_ifree; /* inode free trans */ - uint tr_ichange; /* inode update trans */ - uint tr_growdata; /* fs data section grow trans */ - uint tr_swrite; /* sync write inode trans */ - uint tr_addafork; /* cvt inode to attributed trans */ - uint tr_writeid; /* write setuid/setgid file */ - uint tr_attrinval; /* attr fork buffer invalidation */ - uint tr_attrset; /* set/create an attribute */ - uint tr_attrrm; /* remove an attribute */ - uint tr_clearagi; /* clear bad agi unlinked ino bucket */ - uint tr_growrtalloc; /* grow realtime allocations */ - uint tr_growrtzero; /* grow realtime zeroing */ - uint tr_growrtfree; /* grow realtime freeing */ -} xfs_trans_reservations_t; - -#ifndef __KERNEL__ -/* - * Moved here from xfs_ag.h to avoid reordering header files - */ -#define XFS_DADDR_TO_AGNO(mp,d) \ - ((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks)) -#define XFS_DADDR_TO_AGBNO(mp,d) \ - ((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks)) -#else -struct cred; -struct log; -struct bhv_vfs; -struct bhv_vnode; -struct xfs_mount_args; -struct xfs_ihash; -struct xfs_chash; -struct xfs_inode; -struct xfs_perag; -struct xfs_iocore; -struct xfs_bmbt_irec; -struct xfs_bmap_free; -struct xfs_extdelta; -struct xfs_swapext; - -extern struct bhv_vfsops xfs_vfsops; -extern struct bhv_vnodeops xfs_vnodeops; - -#define AIL_LOCK_T lock_t -#define AIL_LOCKINIT(x,y) spinlock_init(x,y) -#define AIL_LOCK_DESTROY(x) spinlock_destroy(x) -#define AIL_LOCK(mp,s) s=mutex_spinlock(&(mp)->m_ail_lock) -#define AIL_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_ail_lock, s) - - -/* - * Prototypes and functions for the Data Migration subsystem. - */ - -typedef int (*xfs_send_data_t)(int, struct bhv_vnode *, - xfs_off_t, size_t, int, bhv_vrwlock_t *); -typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); -typedef int (*xfs_send_destroy_t)(struct bhv_vnode *, dm_right_t); -typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct bhv_vfs *, - struct bhv_vnode *, - dm_right_t, struct bhv_vnode *, dm_right_t, - char *, char *, mode_t, int, int); -typedef void (*xfs_send_unmount_t)(struct bhv_vfs *, struct bhv_vnode *, - dm_right_t, mode_t, int, int); - -typedef struct xfs_dmops { - xfs_send_data_t xfs_send_data; - xfs_send_mmap_t xfs_send_mmap; - xfs_send_destroy_t xfs_send_destroy; - xfs_send_namesp_t xfs_send_namesp; - xfs_send_unmount_t xfs_send_unmount; -} xfs_dmops_t; - -#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \ - (*(mp)->m_dm_ops.xfs_send_data)(ev,vp,off,len,fl,lock) -#define XFS_SEND_MMAP(mp, vma,fl) \ - (*(mp)->m_dm_ops.xfs_send_mmap)(vma,fl) -#define XFS_SEND_DESTROY(mp, vp,right) \ - (*(mp)->m_dm_ops.xfs_send_destroy)(vp,right) -#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops.xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_PREUNMOUNT(mp, vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ - (*(mp)->m_dm_ops.xfs_send_namesp)(DM_EVENT_PREUNMOUNT,vfs,b1,r1,b2,r2,n1,n2,mode,rval,fl) -#define XFS_SEND_UNMOUNT(mp, vfsp,vp,right,mode,rval,fl) \ - (*(mp)->m_dm_ops.xfs_send_unmount)(vfsp,vp,right,mode,rval,fl) - - -/* - * Prototypes and functions for the Quota Management subsystem. - */ +#ifdef __KERNEL__ -struct xfs_dquot; -struct xfs_dqtrxops; +struct xlog; +struct xfs_inode; +struct xfs_mru_cache; +struct xfs_nameops; +struct xfs_ail; struct xfs_quotainfo; - -typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *); -typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int); -typedef int (*xfs_qmunmount_t)(struct xfs_mount *); -typedef void (*xfs_qmdone_t)(struct xfs_mount *); -typedef void (*xfs_dqrele_t)(struct xfs_dquot *); -typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint); -typedef void (*xfs_dqdetach_t)(struct xfs_inode *); -typedef int (*xfs_dqpurgeall_t)(struct xfs_mount *, uint); -typedef int (*xfs_dqvopalloc_t)(struct xfs_mount *, - struct xfs_inode *, uid_t, gid_t, prid_t, uint, - struct xfs_dquot **, struct xfs_dquot **); -typedef void (*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *); -typedef int (*xfs_dqvoprename_t)(struct xfs_inode **); -typedef struct xfs_dquot * (*xfs_dqvopchown_t)( - struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot **, struct xfs_dquot *); -typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *, - struct xfs_dquot *, struct xfs_dquot *, uint); - -typedef struct xfs_qmops { - xfs_qminit_t xfs_qminit; - xfs_qmdone_t xfs_qmdone; - xfs_qmmount_t xfs_qmmount; - xfs_qmunmount_t xfs_qmunmount; - xfs_dqrele_t xfs_dqrele; - xfs_dqattach_t xfs_dqattach; - xfs_dqdetach_t xfs_dqdetach; - xfs_dqpurgeall_t xfs_dqpurgeall; - xfs_dqvopalloc_t xfs_dqvopalloc; - xfs_dqvopcreate_t xfs_dqvopcreate; - xfs_dqvoprename_t xfs_dqvoprename; - xfs_dqvopchown_t xfs_dqvopchown; - xfs_dqvopchownresv_t xfs_dqvopchownresv; - struct xfs_dqtrxops *xfs_dqtrxops; -} xfs_qmops_t; - -#define XFS_QM_INIT(mp, mnt, fl) \ - (*(mp)->m_qm_ops.xfs_qminit)(mp, mnt, fl) -#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \ - (*(mp)->m_qm_ops.xfs_qmmount)(mp, mnt, fl, mfsi_flags) -#define XFS_QM_UNMOUNT(mp) \ - (*(mp)->m_qm_ops.xfs_qmunmount)(mp) -#define XFS_QM_DONE(mp) \ - (*(mp)->m_qm_ops.xfs_qmdone)(mp) -#define XFS_QM_DQRELE(mp, dq) \ - (*(mp)->m_qm_ops.xfs_dqrele)(dq) -#define XFS_QM_DQATTACH(mp, ip, fl) \ - (*(mp)->m_qm_ops.xfs_dqattach)(ip, fl) -#define XFS_QM_DQDETACH(mp, ip) \ - (*(mp)->m_qm_ops.xfs_dqdetach)(ip) -#define XFS_QM_DQPURGEALL(mp, fl) \ - (*(mp)->m_qm_ops.xfs_dqpurgeall)(mp, fl) -#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \ - (*(mp)->m_qm_ops.xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2) -#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \ - (*(mp)->m_qm_ops.xfs_dqvopcreate)(tp, ip, dq1, dq2) -#define XFS_QM_DQVOPRENAME(mp, ip) \ - (*(mp)->m_qm_ops.xfs_dqvoprename)(ip) -#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \ - (*(mp)->m_qm_ops.xfs_dqvopchown)(tp, ip, dqp, dq) -#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \ - (*(mp)->m_qm_ops.xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl) - - -/* - * Prototypes and functions for I/O core modularization. - */ - -typedef int (*xfs_ioinit_t)(struct bhv_vfs *, - struct xfs_mount_args *, int); -typedef int (*xfs_bmapi_t)(struct xfs_trans *, void *, - xfs_fileoff_t, xfs_filblks_t, int, - xfs_fsblock_t *, xfs_extlen_t, - struct xfs_bmbt_irec *, int *, - struct xfs_bmap_free *, struct xfs_extdelta *); -typedef int (*xfs_bunmapi_t)(struct xfs_trans *, - void *, xfs_fileoff_t, - xfs_filblks_t, int, xfs_extnum_t, - xfs_fsblock_t *, struct xfs_bmap_free *, - struct xfs_extdelta *, int *); -typedef int (*xfs_bmap_eof_t)(void *, xfs_fileoff_t, int, int *); -typedef int (*xfs_iomap_write_direct_t)( - void *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *, int); -typedef int (*xfs_iomap_write_delay_t)( - void *, xfs_off_t, size_t, int, - struct xfs_bmbt_irec *, int *); -typedef int (*xfs_iomap_write_allocate_t)( - void *, xfs_off_t, size_t, - struct xfs_bmbt_irec *, int *); -typedef int (*xfs_iomap_write_unwritten_t)( - void *, xfs_off_t, size_t); -typedef uint (*xfs_lck_map_shared_t)(void *); -typedef void (*xfs_lock_t)(void *, uint); -typedef void (*xfs_lock_demote_t)(void *, uint); -typedef int (*xfs_lock_nowait_t)(void *, uint); -typedef void (*xfs_unlk_t)(void *, unsigned int); -typedef xfs_fsize_t (*xfs_size_t)(void *); -typedef xfs_fsize_t (*xfs_iodone_t)(struct bhv_vfs *); -typedef int (*xfs_swap_extents_t)(void *, void *, - struct xfs_swapext*); - -typedef struct xfs_ioops { - xfs_ioinit_t xfs_ioinit; - xfs_bmapi_t xfs_bmapi_func; - xfs_bunmapi_t xfs_bunmapi_func; - xfs_bmap_eof_t xfs_bmap_eof_func; - xfs_iomap_write_direct_t xfs_iomap_write_direct; - xfs_iomap_write_delay_t xfs_iomap_write_delay; - xfs_iomap_write_allocate_t xfs_iomap_write_allocate; - xfs_iomap_write_unwritten_t xfs_iomap_write_unwritten; - xfs_lock_t xfs_ilock; - xfs_lck_map_shared_t xfs_lck_map_shared; - xfs_lock_demote_t xfs_ilock_demote; - xfs_lock_nowait_t xfs_ilock_nowait; - xfs_unlk_t xfs_unlock; - xfs_size_t xfs_size_func; - xfs_iodone_t xfs_iodone; - xfs_swap_extents_t xfs_swap_extents_func; -} xfs_ioops_t; - -#define XFS_IOINIT(vfsp, args, flags) \ - (*(mp)->m_io_ops.xfs_ioinit)(vfsp, args, flags) -#define XFS_BMAPI(mp, trans,io,bno,len,f,first,tot,mval,nmap,flist,delta) \ - (*(mp)->m_io_ops.xfs_bmapi_func) \ - (trans,(io)->io_obj,bno,len,f,first,tot,mval,nmap,flist,delta) -#define XFS_BUNMAPI(mp, trans,io,bno,len,f,nexts,first,flist,delta,done) \ - (*(mp)->m_io_ops.xfs_bunmapi_func) \ - (trans,(io)->io_obj,bno,len,f,nexts,first,flist,delta,done) -#define XFS_BMAP_EOF(mp, io, endoff, whichfork, eof) \ - (*(mp)->m_io_ops.xfs_bmap_eof_func) \ - ((io)->io_obj, endoff, whichfork, eof) -#define XFS_IOMAP_WRITE_DIRECT(mp, io, offset, count, flags, mval, nmap, found)\ - (*(mp)->m_io_ops.xfs_iomap_write_direct) \ - ((io)->io_obj, offset, count, flags, mval, nmap, found) -#define XFS_IOMAP_WRITE_DELAY(mp, io, offset, count, flags, mval, nmap) \ - (*(mp)->m_io_ops.xfs_iomap_write_delay) \ - ((io)->io_obj, offset, count, flags, mval, nmap) -#define XFS_IOMAP_WRITE_ALLOCATE(mp, io, offset, count, mval, nmap) \ - (*(mp)->m_io_ops.xfs_iomap_write_allocate) \ - ((io)->io_obj, offset, count, mval, nmap) -#define XFS_IOMAP_WRITE_UNWRITTEN(mp, io, offset, count) \ - (*(mp)->m_io_ops.xfs_iomap_write_unwritten) \ - ((io)->io_obj, offset, count) -#define XFS_LCK_MAP_SHARED(mp, io) \ - (*(mp)->m_io_ops.xfs_lck_map_shared)((io)->io_obj) -#define XFS_ILOCK(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_ilock)((io)->io_obj, mode) -#define XFS_ILOCK_NOWAIT(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_ilock_nowait)((io)->io_obj, mode) -#define XFS_IUNLOCK(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_unlock)((io)->io_obj, mode) -#define XFS_ILOCK_DEMOTE(mp, io, mode) \ - (*(mp)->m_io_ops.xfs_ilock_demote)((io)->io_obj, mode) -#define XFS_SIZE(mp, io) \ - (*(mp)->m_io_ops.xfs_size_func)((io)->io_obj) -#define XFS_IODONE(vfsp) \ - (*(mp)->m_io_ops.xfs_iodone)(vfsp) -#define XFS_SWAP_EXTENTS(mp, io, tio, sxp) \ - (*(mp)->m_io_ops.xfs_swap_extents_func) \ - ((io)->io_obj, (tio)->io_obj, sxp) +struct xfs_dir_ops; +struct xfs_da_geometry; #ifdef HAVE_PERCPU_SB @@ -302,25 +47,42 @@ typedef struct xfs_icsb_cnts { #define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ -#define XFS_ICSB_SB_LOCKED (1 << 0) /* sb already locked */ #define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters_lazy(struct xfs_mount *); +extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_destroy_counters(struct xfs_mount *); +extern void xfs_icsb_sync_counters(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); +extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, + int64_t, int); #else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_sync_counters_lazy(mp) do { } while (0) +#define xfs_icsb_init_counters(mp) (0) +#define xfs_icsb_destroy_counters(mp) do { } while (0) +#define xfs_icsb_reinit_counters(mp) do { } while (0) +#define xfs_icsb_sync_counters(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) +#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ + xfs_mod_incore_sb(mp, field, delta, rsvd) #endif +/* dynamic preallocation free space thresholds, 5% down to 1% */ +enum { + XFS_LOWSP_1_PCNT = 0, + XFS_LOWSP_2_PCNT, + XFS_LOWSP_3_PCNT, + XFS_LOWSP_4_PCNT, + XFS_LOWSP_5_PCNT, + XFS_LOWSP_MAX, +}; + typedef struct xfs_mount { - bhv_desc_t m_bhv; /* vfs xfs behavior */ + struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ - AIL_LOCK_T m_ail_lock; /* fs AIL mutex */ - xfs_ail_entry_t m_ail; /* fs active log item list */ - uint m_ail_gen; /* fs AIL generation count */ + struct xfs_ail *m_ail; /* fs active log item list */ xfs_sb_t m_sb; /* copy of fs superblock */ - lock_t m_sb_lock; /* sb counter mutex */ + spinlock_t m_sb_lock; /* sb counter lock */ struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ int m_fsname_len; /* strlen of fs name */ @@ -329,19 +91,15 @@ typedef struct xfs_mount { int m_bsize; /* fs logical block size */ xfs_agnumber_t m_agfrotor; /* last ag where space found */ xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ - lock_t m_agirotor_lock;/* .. and lock protecting it */ + spinlock_t m_agirotor_lock;/* .. and lock protecting it */ xfs_agnumber_t m_maxagi; /* highest inode alloc group */ - uint m_ihsize; /* size of next field */ - struct xfs_ihash *m_ihash; /* fs private inode hash table*/ - struct xfs_inode *m_inodes; /* active inode list */ - struct list_head m_del_inodes; /* inodes to reclaim */ - mutex_t m_ilock; /* inode list mutex */ - uint m_ireclaims; /* count of calls to reclaim*/ uint m_readio_log; /* min read size log bytes */ uint m_readio_blocks; /* min read size blocks */ uint m_writeio_log; /* min write size log bytes */ uint m_writeio_blocks; /* min write size blocks */ - struct log *m_log; /* log specific stuff */ + struct xfs_da_geometry *m_dir_geo; /* directory block geometry */ + struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */ + struct xlog *m_log; /* log specific stuff */ int m_logbufs; /* number of log buffers */ int m_logbsize; /* size of each log buffer */ uint m_rsumlevels; /* rt summary levels */ @@ -353,112 +111,102 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - __uint8_t m_dircook_elog; /* log d-cookie entry bits */ __uint8_t m_blkbit_log; /* blocklog + NBBY */ __uint8_t m_blkbb_log; /* blocklog - BBSHIFT */ __uint8_t m_agno_log; /* log #ag's */ __uint8_t m_agino_log; /* #bits for agino in inum */ - __uint8_t m_nreadaheads; /* #readahead buffers */ - __uint16_t m_inode_cluster_size;/* min inode buf size */ + uint m_inode_cluster_size;/* min inode buf size */ uint m_blockmask; /* sb_blocksize-1 */ uint m_blockwsize; /* sb_blocksize in words */ uint m_blockwmask; /* blockwsize-1 */ - uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */ - uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */ - uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */ - uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */ - uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */ - uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */ + uint m_alloc_mxr[2]; /* max alloc btree records */ + uint m_alloc_mnr[2]; /* min alloc btree records */ + uint m_bmap_dmxr[2]; /* max bmap btree records */ + uint m_bmap_dmnr[2]; /* min bmap btree records */ + uint m_inobt_mxr[2]; /* max inobt btree records */ + uint m_inobt_mnr[2]; /* min inobt btree records */ uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ - uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */ - struct xfs_perag *m_perag; /* per-ag accounting info */ - struct rw_semaphore m_peraglock; /* lock for m_perag (pointer) */ - sema_t m_growlock; /* growfs mutex */ + uint m_in_maxlevels; /* max inobt btree levels. */ + struct radix_tree_root m_perag_tree; /* per-ag accounting info */ + spinlock_t m_perag_lock; /* lock for m_perag_tree */ + struct mutex m_growlock; /* growfs mutex */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ - uint m_attroffset; /* inode attribute offset */ - uint m_dir_node_ents; /* #entries in a dir danode */ - uint m_attr_node_ents; /* #entries in attr danode */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ - int m_litino; /* size of inode union area */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ - xfs_trans_reservations_t m_reservations;/* precomputed res values */ + struct xfs_trans_resv m_resv; /* precomputed res values */ __uint64_t m_maxicount; /* maximum inode count */ - __uint64_t m_maxioffset; /* maximum inode offset */ __uint64_t m_resblks; /* total reserved blocks */ __uint64_t m_resblks_avail;/* available reserved blocks */ -#if XFS_BIG_INUMS - xfs_ino_t m_inoadd; /* add value for ino64_offset */ -#endif + __uint64_t m_resblks_save; /* reserved blks @ remount,ro */ int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ int m_sinoalign; /* stripe unit inode alignment */ - int m_attr_magicpct;/* 37% of the blocksize */ - int m_dir_magicpct; /* 37% of the dir blocksize */ - __uint8_t m_mk_sharedro; /* mark shared ro on unmount */ - __uint8_t m_inode_quiesce;/* call quiesce on new inodes. - field governed by m_ilock */ __uint8_t m_sectbb_log; /* sectlog - BBSHIFT */ - int m_dirblksize; /* directory block sz--bytes */ - int m_dirblkfsbs; /* directory block sz--fsbs */ - xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */ - xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */ - xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */ + const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */ + const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */ + const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ - struct xfs_chash *m_chash; /* fs private inode per-cluster - * hash table */ - struct xfs_dmops m_dm_ops; /* vector of DMI ops */ - struct xfs_qmops m_qm_ops; /* vector of XQM ops */ - struct xfs_ioops m_io_ops; /* vector of I/O ops */ atomic_t m_active_trans; /* number trans frozen */ #ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */ + xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ unsigned long m_icsb_counters; /* disabled per-cpu counters */ struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ + struct mutex m_icsb_mutex; /* balancer sync lock */ #endif + struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ + struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct delayed_work m_eofblocks_work; /* background eof blocks + trimming */ + __int64_t m_update_flags; /* sb flags we need to update + on the next remount,rw */ + int64_t m_low_space[XFS_LOWSP_MAX]; + /* low free space thresholds */ + + struct workqueue_struct *m_data_workqueue; + struct workqueue_struct *m_unwritten_workqueue; + struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; } xfs_mount_t; /* * Flags for m_flags. */ -#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops +#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops must be synchronous except for space allocations */ -#define XFS_MOUNT_INO64 (1ULL << 1) - /* (1ULL << 2) -- currently unused */ - /* (1ULL << 3) -- currently unused */ +#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) #define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem operations, typically for disk errors in metadata */ -#define XFS_MOUNT_RETERR (1ULL << 6) /* return alignment errors to - user */ +#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ #define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment allocations */ #define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ - /* (1ULL << 9) -- currently unused */ +#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ #define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ -#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */ #define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */ -#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */ - /* osyncisdsync is now default*/ #define XFS_MOUNT_32BITINODES (1ULL << 14) /* do not create inodes above * 32 bits in size */ - /* (1ULL << 15) -- currently unused */ +#define XFS_MOUNT_SMALL_INUMS (1ULL << 15) /* users wants 32bit inodes */ #define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ #define XFS_MOUNT_BARRIER (1ULL << 17) -#define XFS_MOUNT_IDELETE (1ULL << 18) /* delete empty inode clusters*/ +#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ #define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width * allocation */ -#define XFS_MOUNT_IHASHSIZE (1ULL << 20) /* inode hash table size */ +#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ #define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ #define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred * I/O size in stat() */ -#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock - counters */ +#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams + allocator */ +#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ /* @@ -478,12 +226,12 @@ typedef struct xfs_mount { * Synchronous read and write sizes. This should be * better for NFSv2 wsync filesystems. */ -#define XFS_WSYNC_READIO_LOG 15 /* 32K */ -#define XFS_WSYNC_WRITEIO_LOG 14 /* 16K */ +#define XFS_WSYNC_READIO_LOG 15 /* 32k */ +#define XFS_WSYNC_WRITEIO_LOG 14 /* 16k */ /* * Allow large block sizes to be reported to userspace programs if the - * "largeio" mount option is used. + * "largeio" mount option is used. * * If compatibility mode is specified, simply return the basic unit of caching * so that we don't get inefficient read/modify/write I/O from user apps. @@ -505,46 +253,26 @@ xfs_preferred_iosize(xfs_mount_t *mp) PAGE_CACHE_SIZE)); } -#define XFS_MAXIOFFSET(mp) ((mp)->m_maxioffset) - +#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ + ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) #define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, + int lnnum); #define xfs_force_shutdown(m,f) \ - bhv_vfs_force_shutdown((XFS_MTOVFS(m)), f, __FILE__, __LINE__) + xfs_do_force_shutdown(m, f, __FILE__, __LINE__) + +#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */ +#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */ +#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ +#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */ +#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */ /* * Flags for xfs_mountfs */ -#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ -#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ -/* XFS_MFSI_RRINODES */ -#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ - /* log recovery */ -#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ -/* XFS_MFSI_CONVERT_SUNIT */ #define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ -/* - * Macros for getting from mount to vfs and back. - */ -#define XFS_MTOVFS(mp) xfs_mtovfs(mp) -static inline struct bhv_vfs *xfs_mtovfs(xfs_mount_t *mp) -{ - return bhvtovfs(&mp->m_bhv); -} - -#define XFS_BHVTOM(bdp) xfs_bhvtom(bdp) -static inline xfs_mount_t *xfs_bhvtom(bhv_desc_t *bdp) -{ - return (xfs_mount_t *)BHV_PDATA(bdp); -} - -#define XFS_VFSTOM(vfs) xfs_vfstom(vfs) -static inline xfs_mount_t *xfs_vfstom(bhv_vfs_t *vfs) -{ - return XFS_BHVTOM(bhv_lookup(VFS_BHVHEAD(vfs), &xfs_vfsops)); -} - -#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d) static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { @@ -553,7 +281,6 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) return (xfs_agnumber_t) ld; } -#define XFS_DADDR_TO_AGBNO(mp,d) xfs_daddr_to_agbno(mp,d) static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { @@ -562,49 +289,106 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* + * Per-cpu superblock locking functions + */ +#ifdef HAVE_PERCPU_SB +static inline void +xfs_icsb_lock(xfs_mount_t *mp) +{ + mutex_lock(&mp->m_icsb_mutex); +} + +static inline void +xfs_icsb_unlock(xfs_mount_t *mp) +{ + mutex_unlock(&mp->m_icsb_mutex); +} +#else +#define xfs_icsb_lock(mp) +#define xfs_icsb_unlock(mp) +#endif + +/* * This structure is for use by the xfs_mod_incore_sb_batch() routine. + * xfs_growfs can specify a few fields which are more than int limit */ typedef struct xfs_mod_sb { xfs_sb_field_t msb_field; /* Field to modify, see below */ - int msb_delta; /* Change to make to specified field */ + int64_t msb_delta; /* Change to make to specified field */ } xfs_mod_sb_t; -#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock)) -#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock)) -#define XFS_SB_LOCK(mp) mutex_spinlock(&(mp)->m_sb_lock) -#define XFS_SB_UNLOCK(mp,s) mutex_spinunlock(&(mp)->m_sb_lock,(s)) - -extern xfs_mount_t *xfs_mount_init(void); -extern void xfs_mod_sb(xfs_trans_t *, __int64_t); -extern void xfs_mount_free(xfs_mount_t *mp, int remove_bhv); -extern int xfs_mountfs(struct bhv_vfs *, xfs_mount_t *mp, int); -extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); - -extern int xfs_unmountfs(xfs_mount_t *, struct cred *); -extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *); -extern int xfs_unmountfs_writesb(xfs_mount_t *); -extern int xfs_unmount_flush(xfs_mount_t *, int); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int); -extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, - int, int); +/* + * Per-ag incore structure, copies of information in agf and agi, to improve the + * performance of allocation group selection. This is defined for the kernel + * only, and hence is defined here instead of in xfs_ag.h. You need the struct + * xfs_mount to be defined to look up a xfs_perag anyway (via mp->m_perag_tree), + * so this doesn't introduce any strange header file dependencies. + */ +typedef struct xfs_perag { + struct xfs_mount *pag_mount; /* owner filesystem */ + xfs_agnumber_t pag_agno; /* AG this structure belongs to */ + atomic_t pag_ref; /* perag reference count */ + char pagf_init; /* this agf's entry is initialized */ + char pagi_init; /* this agi's entry is initialized */ + char pagf_metadata; /* the agf is preferred to be metadata */ + char pagi_inodeok; /* The agi is ok for inodes */ + __uint8_t pagf_levels[XFS_BTNUM_AGF]; + /* # of levels in bno & cnt btree */ + __uint32_t pagf_flcount; /* count of blocks in freelist */ + xfs_extlen_t pagf_freeblks; /* total free blocks */ + xfs_extlen_t pagf_longest; /* longest free space */ + __uint32_t pagf_btreeblks; /* # of blocks held in AGF btrees */ + xfs_agino_t pagi_freecount; /* number of free inodes */ + xfs_agino_t pagi_count; /* number of allocated inodes */ + + /* + * Inode allocation search lookup optimisation. + * If the pagino matches, the search for new inodes + * doesn't need to search the near ones again straight away + */ + xfs_agino_t pagl_pagino; + xfs_agino_t pagl_leftrec; + xfs_agino_t pagl_rightrec; + spinlock_t pagb_lock; /* lock for pagb_tree */ + struct rb_root pagb_tree; /* ordered tree of busy extents */ + + atomic_t pagf_fstrms; /* # of filestreams active in this AG */ + + spinlock_t pag_ici_lock; /* incore inode cache lock */ + struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ + struct mutex pag_ici_reclaim_lock; /* serialisation point */ + unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ + + /* buffer cache index */ + spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ + struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + + /* for rcu-safe freeing */ + struct rcu_head rcu_head; + int pagb_count; /* pagb slots in use */ +} xfs_perag_t; + +extern int xfs_log_sbcount(xfs_mount_t *); +extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); +extern int xfs_mountfs(xfs_mount_t *mp); +extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi); + +extern void xfs_unmountfs(xfs_mount_t *); +extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); +extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); -extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); -extern int xfs_syncsub(xfs_mount_t *, int, int, int *); -extern int xfs_sync_inodes(xfs_mount_t *, int, int, int *); -extern xfs_agnumber_t xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *, - xfs_agnumber_t); -extern void xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t); - -extern struct xfs_dmops xfs_dmcore_stub; -extern struct xfs_qmops xfs_qmcore_stub; -extern struct xfs_ioops xfs_iocore_xfs; - -extern int xfs_init(void); -extern void xfs_cleanup(void); +extern int xfs_fs_writable(xfs_mount_t *); +extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); + +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); + +extern void xfs_set_low_space_thresholds(struct xfs_mount *); #endif /* __KERNEL__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c new file mode 100644 index 00000000000..f99b4933dc2 --- /dev/null +++ b/fs/xfs/xfs_mru_cache.c @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_mru_cache.h" + +/* + * The MRU Cache data structure consists of a data store, an array of lists and + * a lock to protect its internal state. At initialisation time, the client + * supplies an element lifetime in milliseconds and a group count, as well as a + * function pointer to call when deleting elements. A data structure for + * queueing up work in the form of timed callbacks is also included. + * + * The group count controls how many lists are created, and thereby how finely + * the elements are grouped in time. When reaping occurs, all the elements in + * all the lists whose time has expired are deleted. + * + * To give an example of how this works in practice, consider a client that + * initialises an MRU Cache with a lifetime of ten seconds and a group count of + * five. Five internal lists will be created, each representing a two second + * period in time. When the first element is added, time zero for the data + * structure is initialised to the current time. + * + * All the elements added in the first two seconds are appended to the first + * list. Elements added in the third second go into the second list, and so on. + * If an element is accessed at any point, it is removed from its list and + * inserted at the head of the current most-recently-used list. + * + * The reaper function will have nothing to do until at least twelve seconds + * have elapsed since the first element was added. The reason for this is that + * if it were called at t=11s, there could be elements in the first list that + * have only been inactive for nine seconds, so it still does nothing. If it is + * called anywhere between t=12 and t=14 seconds, it will delete all the + * elements that remain in the first list. It's therefore possible for elements + * to remain in the data store even after they've been inactive for up to + * (t + t/g) seconds, where t is the inactive element lifetime and g is the + * number of groups. + * + * The above example assumes that the reaper function gets called at least once + * every (t/g) seconds. If it is called less frequently, unused elements will + * accumulate in the reap list until the reaper function is eventually called. + * The current implementation uses work queue callbacks to carefully time the + * reaper function calls, so this should happen rarely, if at all. + * + * From a design perspective, the primary reason for the choice of a list array + * representing discrete time intervals is that it's only practical to reap + * expired elements in groups of some appreciable size. This automatically + * introduces a granularity to element lifetimes, so there's no point storing an + * individual timeout with each element that specifies a more precise reap time. + * The bonus is a saving of sizeof(long) bytes of memory per element stored. + * + * The elements could have been stored in just one list, but an array of + * counters or pointers would need to be maintained to allow them to be divided + * up into discrete time groups. More critically, the process of touching or + * removing an element would involve walking large portions of the entire list, + * which would have a detrimental effect on performance. The additional memory + * requirement for the array of list heads is minimal. + * + * When an element is touched or deleted, it needs to be removed from its + * current list. Doubly linked lists are used to make the list maintenance + * portion of these operations O(1). Since reaper timing can be imprecise, + * inserts and lookups can occur when there are no free lists available. When + * this happens, all the elements on the LRU list need to be migrated to the end + * of the reap list. To keep the list maintenance portion of these operations + * O(1) also, list tails need to be accessible without walking the entire list. + * This is the reason why doubly linked list heads are used. + */ + +/* + * An MRU Cache is a dynamic data structure that stores its elements in a way + * that allows efficient lookups, but also groups them into discrete time + * intervals based on insertion time. This allows elements to be efficiently + * and automatically reaped after a fixed period of inactivity. + * + * When a client data pointer is stored in the MRU Cache it needs to be added to + * both the data store and to one of the lists. It must also be possible to + * access each of these entries via the other, i.e. to: + * + * a) Walk a list, removing the corresponding data store entry for each item. + * b) Look up a data store entry, then access its list entry directly. + * + * To achieve both of these goals, each entry must contain both a list entry and + * a key, in addition to the user's data pointer. Note that it's not a good + * idea to have the client embed one of these structures at the top of their own + * data structure, because inserting the same item more than once would most + * likely result in a loop in one of the lists. That's a sure-fire recipe for + * an infinite loop in the code. + */ +struct xfs_mru_cache { + struct radix_tree_root store; /* Core storage data structure. */ + struct list_head *lists; /* Array of lists, one per grp. */ + struct list_head reap_list; /* Elements overdue for reaping. */ + spinlock_t lock; /* Lock to protect this struct. */ + unsigned int grp_count; /* Number of discrete groups. */ + unsigned int grp_time; /* Time period spanned by grps. */ + unsigned int lru_grp; /* Group containing time zero. */ + unsigned long time_zero; /* Time first element was added. */ + xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */ + struct delayed_work work; /* Workqueue data for reaping. */ + unsigned int queued; /* work has been queued */ +}; + +static struct workqueue_struct *xfs_mru_reap_wq; + +/* + * When inserting, destroying or reaping, it's first necessary to update the + * lists relative to a particular time. In the case of destroying, that time + * will be well in the future to ensure that all items are moved to the reap + * list. In all other cases though, the time will be the current time. + * + * This function enters a loop, moving the contents of the LRU list to the reap + * list again and again until either a) the lists are all empty, or b) time zero + * has been advanced sufficiently to be within the immediate element lifetime. + * + * Case a) above is detected by counting how many groups are migrated and + * stopping when they've all been moved. Case b) is detected by monitoring the + * time_zero field, which is updated as each group is migrated. + * + * The return value is the earliest time that more migration could be needed, or + * zero if there's no need to schedule more work because the lists are empty. + */ +STATIC unsigned long +_xfs_mru_cache_migrate( + struct xfs_mru_cache *mru, + unsigned long now) +{ + unsigned int grp; + unsigned int migrated = 0; + struct list_head *lru_list; + + /* Nothing to do if the data store is empty. */ + if (!mru->time_zero) + return 0; + + /* While time zero is older than the time spanned by all the lists. */ + while (mru->time_zero <= now - mru->grp_count * mru->grp_time) { + + /* + * If the LRU list isn't empty, migrate its elements to the tail + * of the reap list. + */ + lru_list = mru->lists + mru->lru_grp; + if (!list_empty(lru_list)) + list_splice_init(lru_list, mru->reap_list.prev); + + /* + * Advance the LRU group number, freeing the old LRU list to + * become the new MRU list; advance time zero accordingly. + */ + mru->lru_grp = (mru->lru_grp + 1) % mru->grp_count; + mru->time_zero += mru->grp_time; + + /* + * If reaping is so far behind that all the elements on all the + * lists have been migrated to the reap list, it's now empty. + */ + if (++migrated == mru->grp_count) { + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; + } + } + + /* Find the first non-empty list from the LRU end. */ + for (grp = 0; grp < mru->grp_count; grp++) { + + /* Check the grp'th list from the LRU end. */ + lru_list = mru->lists + ((mru->lru_grp + grp) % mru->grp_count); + if (!list_empty(lru_list)) + return mru->time_zero + + (mru->grp_count + grp) * mru->grp_time; + } + + /* All the lists must be empty. */ + mru->lru_grp = 0; + mru->time_zero = 0; + return 0; +} + +/* + * When inserting or doing a lookup, an element needs to be inserted into the + * MRU list. The lists must be migrated first to ensure that they're + * up-to-date, otherwise the new element could be given a shorter lifetime in + * the cache than it should. + */ +STATIC void +_xfs_mru_cache_list_insert( + struct xfs_mru_cache *mru, + struct xfs_mru_cache_elem *elem) +{ + unsigned int grp = 0; + unsigned long now = jiffies; + + /* + * If the data store is empty, initialise time zero, leave grp set to + * zero and start the work queue timer if necessary. Otherwise, set grp + * to the number of group times that have elapsed since time zero. + */ + if (!_xfs_mru_cache_migrate(mru, now)) { + mru->time_zero = now; + if (!mru->queued) { + mru->queued = 1; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, + mru->grp_count * mru->grp_time); + } + } else { + grp = (now - mru->time_zero) / mru->grp_time; + grp = (mru->lru_grp + grp) % mru->grp_count; + } + + /* Insert the element at the tail of the corresponding list. */ + list_add_tail(&elem->list_node, mru->lists + grp); +} + +/* + * When destroying or reaping, all the elements that were migrated to the reap + * list need to be deleted. For each element this involves removing it from the + * data store, removing it from the reap list, calling the client's free + * function and deleting the element from the element zone. + * + * We get called holding the mru->lock, which we drop and then reacquire. + * Sparse need special help with this to tell it we know what we are doing. + */ +STATIC void +_xfs_mru_cache_clear_reap_list( + struct xfs_mru_cache *mru) + __releases(mru->lock) __acquires(mru->lock) +{ + struct xfs_mru_cache_elem *elem, *next; + struct list_head tmp; + + INIT_LIST_HEAD(&tmp); + list_for_each_entry_safe(elem, next, &mru->reap_list, list_node) { + + /* Remove the element from the data store. */ + radix_tree_delete(&mru->store, elem->key); + + /* + * remove to temp list so it can be freed without + * needing to hold the lock + */ + list_move(&elem->list_node, &tmp); + } + spin_unlock(&mru->lock); + + list_for_each_entry_safe(elem, next, &tmp, list_node) { + list_del_init(&elem->list_node); + mru->free_func(elem); + } + + spin_lock(&mru->lock); +} + +/* + * We fire the reap timer every group expiry interval so + * we always have a reaper ready to run. This makes shutdown + * and flushing of the reaper easy to do. Hence we need to + * keep when the next reap must occur so we can determine + * at each interval whether there is anything we need to do. + */ +STATIC void +_xfs_mru_cache_reap( + struct work_struct *work) +{ + struct xfs_mru_cache *mru = + container_of(work, struct xfs_mru_cache, work.work); + unsigned long now, next; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + next = _xfs_mru_cache_migrate(mru, jiffies); + _xfs_mru_cache_clear_reap_list(mru); + + mru->queued = next; + if ((mru->queued > 0)) { + now = jiffies; + if (next <= now) + next = 0; + else + next -= now; + queue_delayed_work(xfs_mru_reap_wq, &mru->work, next); + } + + spin_unlock(&mru->lock); +} + +int +xfs_mru_cache_init(void) +{ + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); + if (!xfs_mru_reap_wq) + return -ENOMEM; + return 0; +} + +void +xfs_mru_cache_uninit(void) +{ + destroy_workqueue(xfs_mru_reap_wq); +} + +/* + * To initialise a struct xfs_mru_cache pointer, call xfs_mru_cache_create() + * with the address of the pointer, a lifetime value in milliseconds, a group + * count and a free function to use when deleting elements. This function + * returns 0 if the initialisation was successful. + */ +int +xfs_mru_cache_create( + struct xfs_mru_cache **mrup, + unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func) +{ + struct xfs_mru_cache *mru = NULL; + int err = 0, grp; + unsigned int grp_time; + + if (mrup) + *mrup = NULL; + + if (!mrup || !grp_count || !lifetime_ms || !free_func) + return EINVAL; + + if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count)) + return EINVAL; + + if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP))) + return ENOMEM; + + /* An extra list is needed to avoid reaping up to a grp_time early. */ + mru->grp_count = grp_count + 1; + mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP); + + if (!mru->lists) { + err = ENOMEM; + goto exit; + } + + for (grp = 0; grp < mru->grp_count; grp++) + INIT_LIST_HEAD(mru->lists + grp); + + /* + * We use GFP_KERNEL radix tree preload and do inserts under a + * spinlock so GFP_ATOMIC is appropriate for the radix tree itself. + */ + INIT_RADIX_TREE(&mru->store, GFP_ATOMIC); + INIT_LIST_HEAD(&mru->reap_list); + spin_lock_init(&mru->lock); + INIT_DELAYED_WORK(&mru->work, _xfs_mru_cache_reap); + + mru->grp_time = grp_time; + mru->free_func = free_func; + + *mrup = mru; + +exit: + if (err && mru && mru->lists) + kmem_free(mru->lists); + if (err && mru) + kmem_free(mru); + + return err; +} + +/* + * Call xfs_mru_cache_flush() to flush out all cached entries, calling their + * free functions as they're deleted. When this function returns, the caller is + * guaranteed that all the free functions for all the elements have finished + * executing and the reaper is not running. + */ +static void +xfs_mru_cache_flush( + struct xfs_mru_cache *mru) +{ + if (!mru || !mru->lists) + return; + + spin_lock(&mru->lock); + if (mru->queued) { + spin_unlock(&mru->lock); + cancel_delayed_work_sync(&mru->work); + spin_lock(&mru->lock); + } + + _xfs_mru_cache_migrate(mru, jiffies + mru->grp_count * mru->grp_time); + _xfs_mru_cache_clear_reap_list(mru); + + spin_unlock(&mru->lock); +} + +void +xfs_mru_cache_destroy( + struct xfs_mru_cache *mru) +{ + if (!mru || !mru->lists) + return; + + xfs_mru_cache_flush(mru); + + kmem_free(mru->lists); + kmem_free(mru); +} + +/* + * To insert an element, call xfs_mru_cache_insert() with the data store, the + * element's key and the client data pointer. This function returns 0 on + * success or ENOMEM if memory for the data element couldn't be allocated. + */ +int +xfs_mru_cache_insert( + struct xfs_mru_cache *mru, + unsigned long key, + struct xfs_mru_cache_elem *elem) +{ + int error; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return EINVAL; + + if (radix_tree_preload(GFP_KERNEL)) + return ENOMEM; + + INIT_LIST_HEAD(&elem->list_node); + elem->key = key; + + spin_lock(&mru->lock); + error = -radix_tree_insert(&mru->store, key, elem); + radix_tree_preload_end(); + if (!error) + _xfs_mru_cache_list_insert(mru, elem); + spin_unlock(&mru->lock); + + return error; +} + +/* + * To remove an element without calling the free function, call + * xfs_mru_cache_remove() with the data store and the element's key. On success + * the client data pointer for the removed element is returned, otherwise this + * function will return a NULL pointer. + */ +struct xfs_mru_cache_elem * +xfs_mru_cache_remove( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_delete(&mru->store, key); + if (elem) + list_del(&elem->list_node); + spin_unlock(&mru->lock); + + return elem; +} + +/* + * To remove and element and call the free function, call xfs_mru_cache_delete() + * with the data store and the element's key. + */ +void +xfs_mru_cache_delete( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + elem = xfs_mru_cache_remove(mru, key); + if (elem) + mru->free_func(elem); +} + +/* + * To look up an element using its key, call xfs_mru_cache_lookup() with the + * data store and the element's key. If found, the element will be moved to the + * head of the MRU list to indicate that it's been touched. + * + * The internal data structures are protected by a spinlock that is STILL HELD + * when this function returns. Call xfs_mru_cache_done() to release it. Note + * that it is not safe to call any function that might sleep in the interim. + * + * The implementation could have used reference counting to avoid this + * restriction, but since most clients simply want to get, set or test a member + * of the returned data structure, the extra per-element memory isn't warranted. + * + * If the element isn't found, this function returns NULL and the spinlock is + * released. xfs_mru_cache_done() should NOT be called when this occurs. + * + * Because sparse isn't smart enough to know about conditional lock return + * status, we need to help it get it right by annotating the path that does + * not release the lock. + */ +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup( + struct xfs_mru_cache *mru, + unsigned long key) +{ + struct xfs_mru_cache_elem *elem; + + ASSERT(mru && mru->lists); + if (!mru || !mru->lists) + return NULL; + + spin_lock(&mru->lock); + elem = radix_tree_lookup(&mru->store, key); + if (elem) { + list_del(&elem->list_node); + _xfs_mru_cache_list_insert(mru, elem); + __release(mru_lock); /* help sparse not be stupid */ + } else + spin_unlock(&mru->lock); + + return elem; +} + +/* + * To release the internal data structure spinlock after having performed an + * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() + * with the data store pointer. + */ +void +xfs_mru_cache_done( + struct xfs_mru_cache *mru) + __releases(mru->lock) +{ + spin_unlock(&mru->lock); +} diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h new file mode 100644 index 00000000000..fb5245ba5ff --- /dev/null +++ b/fs/xfs/xfs_mru_cache.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2006-2007 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_MRU_CACHE_H__ +#define __XFS_MRU_CACHE_H__ + +struct xfs_mru_cache; + +struct xfs_mru_cache_elem { + struct list_head list_node; + unsigned long key; +}; + +/* Function pointer type for callback to free a client's data pointer. */ +typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem); + +int xfs_mru_cache_init(void); +void xfs_mru_cache_uninit(void); +int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms, + unsigned int grp_count, + xfs_mru_cache_free_func_t free_func); +void xfs_mru_cache_destroy(struct xfs_mru_cache *mru); +int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, + struct xfs_mru_cache_elem *elem); +struct xfs_mru_cache_elem * +xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); +struct xfs_mru_cache_elem * +xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); +void xfs_mru_cache_done(struct xfs_mru_cache *mru); + +#endif /* __XFS_MRU_CACHE_H__ */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c new file mode 100644 index 00000000000..6d26759c779 --- /dev/null +++ b/fs/xfs/xfs_qm.c @@ -0,0 +1,1966 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_quota.h" +#include "xfs_error.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_qm.h" +#include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_cksum.h" +#include "xfs_dinode.h" + +/* + * The global quota manager. There is only one of these for the entire + * system, _not_ one per file system. XQM keeps track of the overall + * quota functionality, including maintaining the freelist and hash + * tables of dquots. + */ +STATIC int xfs_qm_init_quotainos(xfs_mount_t *); +STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); + + +STATIC void xfs_qm_dqfree_one(struct xfs_dquot *dqp); +/* + * We use the batch lookup interface to iterate over the dquots as it + * currently is the only interface into the radix tree code that allows + * fuzzy lookups instead of exact matches. Holding the lock over multiple + * operations is fine as all callers are used either during mount/umount + * or quotaoff. + */ +#define XFS_DQ_LOOKUP_BATCH 32 + +STATIC int +xfs_qm_dquot_walk( + struct xfs_mount *mp, + int type, + int (*execute)(struct xfs_dquot *dqp, void *data), + void *data) +{ + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + uint32_t next_index; + int last_error = 0; + int skipped; + int nr_found; + +restart: + skipped = 0; + next_index = 0; + nr_found = 0; + + while (1) { + struct xfs_dquot *batch[XFS_DQ_LOOKUP_BATCH]; + int error = 0; + int i; + + mutex_lock(&qi->qi_tree_lock); + nr_found = radix_tree_gang_lookup(tree, (void **)batch, + next_index, XFS_DQ_LOOKUP_BATCH); + if (!nr_found) { + mutex_unlock(&qi->qi_tree_lock); + break; + } + + for (i = 0; i < nr_found; i++) { + struct xfs_dquot *dqp = batch[i]; + + next_index = be32_to_cpu(dqp->q_core.d_id) + 1; + + error = execute(batch[i], data); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + mutex_unlock(&qi->qi_tree_lock); + + /* bail out if the filesystem is corrupted. */ + if (last_error == EFSCORRUPTED) { + skipped = 0; + break; + } + } + + if (skipped) { + delay(1); + goto restart; + } + + return last_error; +} + + +/* + * Purge a dquot from all tracking data structures and free it. + */ +STATIC int +xfs_qm_dqpurge( + struct xfs_dquot *dqp, + void *data) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + + xfs_dqlock(dqp); + if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) { + xfs_dqunlock(dqp); + return EAGAIN; + } + + dqp->dq_flags |= XFS_DQ_FREEING; + + xfs_dqflock(dqp); + + /* + * If we are turning this type of quotas off, we don't care + * about the dirty metadata sitting in this dquot. OTOH, if + * we're unmounting, we do care, so we flush it and wait. + */ + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + /* + * We don't care about getting disk errors here. We need + * to purge this dquot anyway, so we go ahead regardless. + */ + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(mp, "%s: dquot %p flush failed", + __func__, dqp); + } else { + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + } + xfs_dqflock(dqp); + } + + ASSERT(atomic_read(&dqp->q_pincount) == 0); + ASSERT(XFS_FORCED_SHUTDOWN(mp) || + !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + + xfs_dqfunlock(dqp); + xfs_dqunlock(dqp); + + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + qi->qi_dquots--; + + /* + * We move dquots to the freelist as soon as their reference count + * hits zero, so it really should be on the freelist here. + */ + ASSERT(!list_empty(&dqp->q_lru)); + list_lru_del(&qi->qi_lru, &dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + + xfs_qm_dqdestroy(dqp); + return 0; +} + +/* + * Purge the dquot cache. + */ +void +xfs_qm_dqpurge_all( + struct xfs_mount *mp, + uint flags) +{ + if (flags & XFS_QMOPT_UQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_GQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL); + if (flags & XFS_QMOPT_PQUOTA) + xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL); +} + +/* + * Just destroy the quotainfo structure. + */ +void +xfs_qm_unmount( + struct xfs_mount *mp) +{ + if (mp->m_quotainfo) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_destroy_quotainfo(mp); + } +} + + +/* + * This is called from xfs_mountfs to start quotas and initialize all + * necessary data structures like quotainfo. This is also responsible for + * running a quotacheck as necessary. We are guaranteed that the superblock + * is consistently read in at this point. + * + * If we fail here, the mount will continue with quota turned off. We don't + * need to inidicate success or failure at all. + */ +void +xfs_qm_mount_quotas( + xfs_mount_t *mp) +{ + int error = 0; + uint sbf; + + /* + * If quotas on realtime volumes is not supported, we disable + * quotas immediately. + */ + if (mp->m_sb.sb_rextents) { + xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); + mp->m_qflags = 0; + goto write_changes; + } + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * Allocate the quotainfo structure inside the mount struct, and + * create quotainode(s), and change/rev superblock if necessary. + */ + error = xfs_qm_init_quotainfo(mp); + if (error) { + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo == NULL); + mp->m_qflags = 0; + goto write_changes; + } + /* + * If any of the quotas are not consistent, do a quotacheck. + */ + if (XFS_QM_NEED_QUOTACHECK(mp)) { + error = xfs_qm_quotacheck(mp); + if (error) { + /* Quotacheck failed and disabled quotas. */ + return; + } + } + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + if (!XFS_IS_UQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_UQUOTA_CHKD; + if (!XFS_IS_GQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_GQUOTA_CHKD; + if (!XFS_IS_PQUOTA_ON(mp)) + mp->m_qflags &= ~XFS_PQUOTA_CHKD; + + write_changes: + /* + * We actually don't have to acquire the m_sb_lock at all. + * This can only be called from mount, and that's single threaded. XXX + */ + spin_lock(&mp->m_sb_lock); + sbf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { + if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { + /* + * We could only have been turning quotas off. + * We aren't in very good shape actually because + * the incore structures are convinced that quotas are + * off, but the on disk superblock doesn't know that ! + */ + ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + xfs_alert(mp, "%s: Superblock update failed!", + __func__); + } + } + + if (error) { + xfs_warn(mp, "Failed to initialize disk quotas."); + return; + } +} + +/* + * Called from the vfsops layer. + */ +void +xfs_qm_unmount_quotas( + xfs_mount_t *mp) +{ + /* + * Release the dquots that root inode, et al might be holding, + * before we flush quotas and blow away the quotainfo structure. + */ + ASSERT(mp->m_rootip); + xfs_qm_dqdetach(mp->m_rootip); + if (mp->m_rbmip) + xfs_qm_dqdetach(mp->m_rbmip); + if (mp->m_rsumip) + xfs_qm_dqdetach(mp->m_rsumip); + + /* + * Release the quota inodes. + */ + if (mp->m_quotainfo) { + if (mp->m_quotainfo->qi_uquotaip) { + IRELE(mp->m_quotainfo->qi_uquotaip); + mp->m_quotainfo->qi_uquotaip = NULL; + } + if (mp->m_quotainfo->qi_gquotaip) { + IRELE(mp->m_quotainfo->qi_gquotaip); + mp->m_quotainfo->qi_gquotaip = NULL; + } + if (mp->m_quotainfo->qi_pquotaip) { + IRELE(mp->m_quotainfo->qi_pquotaip); + mp->m_quotainfo->qi_pquotaip = NULL; + } + } +} + +STATIC int +xfs_qm_dqattach_one( + xfs_inode_t *ip, + xfs_dqid_t id, + uint type, + uint doalloc, + xfs_dquot_t **IO_idqpp) +{ + xfs_dquot_t *dqp; + int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + error = 0; + + /* + * See if we already have it in the inode itself. IO_idqpp is &i_udquot + * or &i_gdquot. This made the code look weird, but made the logic a lot + * simpler. + */ + dqp = *IO_idqpp; + if (dqp) { + trace_xfs_dqattach_found(dqp); + return 0; + } + + /* + * Find the dquot from somewhere. This bumps the reference count of + * dquot and returns it locked. This can return ENOENT if dquot didn't + * exist on disk and we didn't ask it to allocate; ESRCH if quotas got + * turned off suddenly. + */ + error = xfs_qm_dqget(ip->i_mount, ip, id, type, + doalloc | XFS_QMOPT_DOWARN, &dqp); + if (error) + return error; + + trace_xfs_dqattach_get(dqp); + + /* + * dqget may have dropped and re-acquired the ilock, but it guarantees + * that the dquot returned is the one that should go in the inode. + */ + *IO_idqpp = dqp; + xfs_dqunlock(dqp); + return 0; +} + +static bool +xfs_qm_need_dqattach( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return false; + if (!XFS_IS_QUOTA_ON(mp)) + return false; + if (!XFS_NOT_DQATTACHED(mp, ip)) + return false; + if (xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) + return false; + return true; +} + +/* + * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON + * into account. + * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * Inode may get unlocked and relocked in here, and the caller must deal with + * the consequences. + */ +int +xfs_qm_dqattach_locked( + xfs_inode_t *ip, + uint flags) +{ + xfs_mount_t *mp = ip->i_mount; + int error = 0; + + if (!xfs_qm_need_dqattach(ip)) + return 0; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, + flags & XFS_QMOPT_DQALLOC, + &ip->i_udquot); + if (error) + goto done; + ASSERT(ip->i_udquot); + } + + if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { + error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, + flags & XFS_QMOPT_DQALLOC, + &ip->i_gdquot); + if (error) + goto done; + ASSERT(ip->i_gdquot); + } + + if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { + error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, + flags & XFS_QMOPT_DQALLOC, + &ip->i_pdquot); + if (error) + goto done; + ASSERT(ip->i_pdquot); + } + +done: + /* + * Don't worry about the dquots that we may have attached before any + * error - they'll get detached later if it has not already been done. + */ + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + return error; +} + +int +xfs_qm_dqattach( + struct xfs_inode *ip, + uint flags) +{ + int error; + + if (!xfs_qm_need_dqattach(ip)) + return 0; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqattach_locked(ip, flags); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return error; +} + +/* + * Release dquots (and their references) if any. + * The inode should be locked EXCL except when this's called by + * xfs_ireclaim. + */ +void +xfs_qm_dqdetach( + xfs_inode_t *ip) +{ + if (!(ip->i_udquot || ip->i_gdquot || ip->i_pdquot)) + return; + + trace_xfs_dquot_dqdetach(ip); + + ASSERT(!xfs_is_quota_inode(&ip->i_mount->m_sb, ip->i_ino)); + if (ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if (ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if (ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } +} + +struct xfs_qm_isolate { + struct list_head buffers; + struct list_head dispose; +}; + +static enum lru_status +xfs_qm_dquot_isolate( + struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct xfs_dquot *dqp = container_of(item, + struct xfs_dquot, q_lru); + struct xfs_qm_isolate *isol = arg; + + if (!xfs_dqlock_nowait(dqp)) + goto out_miss_busy; + + /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ + if (dqp->q_nrefs) { + xfs_dqunlock(dqp); + XFS_STATS_INC(xs_qm_dqwants); + + trace_xfs_dqreclaim_want(dqp); + list_del_init(&dqp->q_lru); + XFS_STATS_DEC(xs_qm_dquot_unused); + return LRU_REMOVED; + } + + /* + * If the dquot is dirty, flush it. If it's already being flushed, just + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ + if (!xfs_dqflock_nowait(dqp)) { + xfs_dqunlock(dqp); + goto out_miss_busy; + } + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; + int error; + + trace_xfs_dqreclaim_dirty(dqp); + + /* we have to drop the LRU lock to flush the dquot */ + spin_unlock(lru_lock); + + error = xfs_qm_dqflush(dqp, &bp); + if (error) { + xfs_warn(dqp->q_mount, "%s: dquot %p flush failed", + __func__, dqp); + goto out_unlock_dirty; + } + + xfs_buf_delwri_queue(bp, &isol->buffers); + xfs_buf_relse(bp); + goto out_unlock_dirty; + } + xfs_dqfunlock(dqp); + + /* + * Prevent lookups now that we are past the point of no return. + */ + dqp->dq_flags |= XFS_DQ_FREEING; + xfs_dqunlock(dqp); + + ASSERT(dqp->q_nrefs == 0); + list_move_tail(&dqp->q_lru, &isol->dispose); + XFS_STATS_DEC(xs_qm_dquot_unused); + trace_xfs_dqreclaim_done(dqp); + XFS_STATS_INC(xs_qm_dqreclaims); + return LRU_REMOVED; + +out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + return LRU_SKIP; + +out_unlock_dirty: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(xs_qm_dqreclaim_misses); + xfs_dqunlock(dqp); + spin_lock(lru_lock); + return LRU_RETRY; +} + +static unsigned long +xfs_qm_shrink_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + struct xfs_qm_isolate isol; + unsigned long freed; + int error; + unsigned long nr_to_scan = sc->nr_to_scan; + + if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) + return 0; + + INIT_LIST_HEAD(&isol.buffers); + INIT_LIST_HEAD(&isol.dispose); + + freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, + &nr_to_scan); + + error = xfs_buf_delwri_submit(&isol.buffers); + if (error) + xfs_warn(NULL, "%s: dquot reclaim failed", __func__); + + while (!list_empty(&isol.dispose)) { + struct xfs_dquot *dqp; + + dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru); + list_del_init(&dqp->q_lru); + xfs_qm_dqfree_one(dqp); + } + + return freed; +} + +static unsigned long +xfs_qm_shrink_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_quotainfo *qi = container_of(shrink, + struct xfs_quotainfo, qi_shrinker); + + return list_lru_count_node(&qi->qi_lru, sc->nid); +} + +/* + * This initializes all the quota information that's kept in the + * mount structure + */ +STATIC int +xfs_qm_init_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qinf; + int error; + xfs_dquot_t *dqp; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); + + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; + + /* + * See if quotainodes are setup, and if not, allocate them, + * and change the superblock accordingly. + */ + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; + + INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); + INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS); + mutex_init(&qinf->qi_tree_lock); + + /* mutex used to serialize quotaoffs */ + mutex_init(&qinf->qi_quotaofflock); + + /* Precalc some constants */ + qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); + + mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * + * We look at the USR dquot with id == 0 first, but if user quotas + * are not enabled we goto the GRP dquot with id == 0. + * We don't really care to keep separate default limits for user + * and group quotas, at least not at this point. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. + */ + error = xfs_qm_dqread(mp, 0, + XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : + (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : + XFS_DQ_PROJ), + XFS_QMOPT_DOWARN, &dqp); + if (!error) { + xfs_disk_dquot_t *ddqp = &dqp->q_core; + + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + qinf->qi_btimelimit = ddqp->d_btimer ? + be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = ddqp->d_itimer ? + be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? + be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = ddqp->d_bwarns ? + be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = ddqp->d_iwarns ? + be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? + be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; + qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + + xfs_qm_dqdestroy(dqp); + } else { + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + } + + qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker.seeks = DEFAULT_SEEKS; + qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; + register_shrinker(&qinf->qi_shrinker); + return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; +} + + +/* + * Gets called when unmounting a filesystem or when all quotas get + * turned off. + * This purges the quota inodes, destroys locks and frees itself. + */ +void +xfs_qm_destroy_quotainfo( + xfs_mount_t *mp) +{ + xfs_quotainfo_t *qi; + + qi = mp->m_quotainfo; + ASSERT(qi != NULL); + + unregister_shrinker(&qi->qi_shrinker); + list_lru_destroy(&qi->qi_lru); + + if (qi->qi_uquotaip) { + IRELE(qi->qi_uquotaip); + qi->qi_uquotaip = NULL; /* paranoia */ + } + if (qi->qi_gquotaip) { + IRELE(qi->qi_gquotaip); + qi->qi_gquotaip = NULL; + } + if (qi->qi_pquotaip) { + IRELE(qi->qi_pquotaip); + qi->qi_pquotaip = NULL; + } + mutex_destroy(&qi->qi_quotaofflock); + kmem_free(qi); + mp->m_quotainfo = NULL; +} + +/* + * Create an inode and return with a reference already taken, but unlocked + * This is how we create quota inodes + */ +STATIC int +xfs_qm_qino_alloc( + xfs_mount_t *mp, + xfs_inode_t **ip, + __int64_t sbfields, + uint flags) +{ + xfs_trans_t *tp; + int error; + int committed; + + *ip = NULL; + /* + * With superblock that doesn't have separate pquotino, we + * share an inode between gquota and pquota. If the on-disk + * superblock has GQUOTA and the filesystem is now mounted + * with PQUOTA, just use sb_gquotino for sb_pquotino and + * vice-versa. + */ + if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { + xfs_ino_t ino = NULLFSINO; + + if ((flags & XFS_QMOPT_PQUOTA) && + (mp->m_sb.sb_gquotino != NULLFSINO)) { + ino = mp->m_sb.sb_gquotino; + ASSERT(mp->m_sb.sb_pquotino == NULLFSINO); + } else if ((flags & XFS_QMOPT_GQUOTA) && + (mp->m_sb.sb_pquotino != NULLFSINO)) { + ino = mp->m_sb.sb_pquotino; + ASSERT(mp->m_sb.sb_gquotino == NULLFSINO); + } + if (ino != NULLFSINO) { + error = xfs_iget(mp, NULL, ino, 0, 0, ip); + if (error) + return error; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + } + } + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, + XFS_QM_QINOCREATE_SPACE_RES(mp), 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + if (!*ip) { + error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, + &committed); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + return error; + } + } + + /* + * Make the changes in the superblock, and log those too. + * sbfields arg may contain fields other than *QUOTINO; + * VERSIONNUM for example. + */ + spin_lock(&mp->m_sb_lock); + if (flags & XFS_QMOPT_SBVERSION) { + ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) == + (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | + XFS_SB_QFLAGS)); + + xfs_sb_version_addquota(&mp->m_sb); + mp->m_sb.sb_uquotino = NULLFSINO; + mp->m_sb.sb_gquotino = NULLFSINO; + mp->m_sb.sb_pquotino = NULLFSINO; + + /* qflags will get updated fully _after_ quotacheck */ + mp->m_sb.sb_qflags = mp->m_qflags & XFS_ALL_QUOTA_ACCT; + } + if (flags & XFS_QMOPT_UQUOTA) + mp->m_sb.sb_uquotino = (*ip)->i_ino; + else if (flags & XFS_QMOPT_GQUOTA) + mp->m_sb.sb_gquotino = (*ip)->i_ino; + else + mp->m_sb.sb_pquotino = (*ip)->i_ino; + spin_unlock(&mp->m_sb_lock); + xfs_mod_sb(tp, sbfields); + + if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + xfs_alert(mp, "%s failed (error %d)!", __func__, error); + return error; + } + return 0; +} + + +STATIC void +xfs_qm_reset_dqcounts( + xfs_mount_t *mp, + xfs_buf_t *bp, + xfs_dqid_t id, + uint type) +{ + struct xfs_dqblk *dqb; + int j; + + trace_xfs_reset_dqcounts(bp, _RET_IP_); + + /* + * Reset all counters and timers. They'll be + * started afresh by xfs_qm_quotacheck. + */ +#ifdef DEBUG + j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); + do_div(j, sizeof(xfs_dqblk_t)); + ASSERT(mp->m_quotainfo->qi_dqperchunk == j); +#endif + dqb = bp->b_addr; + for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + + /* + * Do a sanity check, and if needed, repair the dqblk. Don't + * output any warnings because it's perfectly possible to + * find uninitialised dquot blks. See comment in xfs_dqcheck. + */ + xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR, + "xfs_quotacheck"); + ddq->d_bcount = 0; + ddq->d_icount = 0; + ddq->d_rtbcount = 0; + ddq->d_btimer = 0; + ddq->d_itimer = 0; + ddq->d_rtbtimer = 0; + ddq->d_bwarns = 0; + ddq->d_iwarns = 0; + ddq->d_rtbwarns = 0; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + } +} + +STATIC int +xfs_qm_dqiter_bufs( + struct xfs_mount *mp, + xfs_dqid_t firstid, + xfs_fsblock_t bno, + xfs_filblks_t blkcnt, + uint flags, + struct list_head *buffer_list) +{ + struct xfs_buf *bp; + int error; + int type; + + ASSERT(blkcnt > 0); + type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : + (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); + error = 0; + + /* + * Blkcnt arg can be a very big number, and might even be + * larger than the log itself. So, we have to break it up into + * manageable-sized transactions. + * Note that we don't start a permanent transaction here; we might + * not be able to get a log reservation for the whole thing up front, + * and we don't really care to either, because we just discard + * everything if we were to crash in the middle of this loop. + */ + while (blkcnt--) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + + /* + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. + */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + + xfs_qm_reset_dqcounts(mp, bp, firstid, type); + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); + + /* goto the next block. */ + bno++; + firstid += mp->m_quotainfo->qi_dqperchunk; + } + + return error; +} + +/* + * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a + * caller supplied function for every chunk of dquots that we find. + */ +STATIC int +xfs_qm_dqiterate( + struct xfs_mount *mp, + struct xfs_inode *qip, + uint flags, + struct list_head *buffer_list) +{ + struct xfs_bmbt_irec *map; + int i, nmaps; /* number of map entries */ + int error; /* return value */ + xfs_fileoff_t lblkno; + xfs_filblks_t maxlblkcnt; + xfs_dqid_t firstid; + xfs_fsblock_t rablkno; + xfs_filblks_t rablkcnt; + + error = 0; + /* + * This looks racy, but we can't keep an inode lock across a + * trans_reserve. But, this gets called during quotacheck, and that + * happens only at mount time which is single threaded. + */ + if (qip->i_d.di_nblocks == 0) + return 0; + + map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP); + + lblkno = 0; + maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); + do { + uint lock_mode; + + nmaps = XFS_DQITER_MAP_SIZE; + /* + * We aren't changing the inode itself. Just changing + * some of its data. No new blocks are added here, and + * the inode is never added to the transaction. + */ + lock_mode = xfs_ilock_data_map_shared(qip); + error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno, + map, &nmaps, 0); + xfs_iunlock(qip, lock_mode); + if (error) + break; + + ASSERT(nmaps <= XFS_DQITER_MAP_SIZE); + for (i = 0; i < nmaps; i++) { + ASSERT(map[i].br_startblock != DELAYSTARTBLOCK); + ASSERT(map[i].br_blockcount); + + + lblkno += map[i].br_blockcount; + + if (map[i].br_startblock == HOLESTARTBLOCK) + continue; + + firstid = (xfs_dqid_t) map[i].br_startoff * + mp->m_quotainfo->qi_dqperchunk; + /* + * Do a read-ahead on the next extent. + */ + if ((i+1 < nmaps) && + (map[i+1].br_startblock != HOLESTARTBLOCK)) { + rablkcnt = map[i+1].br_blockcount; + rablkno = map[i+1].br_startblock; + while (rablkcnt--) { + xfs_buf_readahead(mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, rablkno), + mp->m_quotainfo->qi_dqchunklen, + NULL); + rablkno++; + } + } + /* + * Iterate thru all the blks in the extent and + * reset the counters of all the dquots inside them. + */ + error = xfs_qm_dqiter_bufs(mp, firstid, + map[i].br_startblock, + map[i].br_blockcount, + flags, buffer_list); + if (error) + goto out; + } + } while (nmaps > 0); + +out: + kmem_free(map); + return error; +} + +/* + * Called by dqusage_adjust in doing a quotacheck. + * + * Given the inode, and a dquot id this updates both the incore dqout as well + * as the buffer copy. This is so that once the quotacheck is done, we can + * just log all the buffers, as opposed to logging numerous updates to + * individual dquots. + */ +STATIC int +xfs_qm_quotacheck_dqadjust( + struct xfs_inode *ip, + xfs_dqid_t id, + uint type, + xfs_qcnt_t nblks, + xfs_qcnt_t rtblks) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget(mp, ip, id, type, + XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp); + if (error) { + /* + * Shouldn't be able to turn off quotas here. + */ + ASSERT(error != ESRCH); + ASSERT(error != ENOENT); + return error; + } + + trace_xfs_dqadjust(dqp); + + /* + * Adjust the inode count and the block count to reflect this inode's + * resource usage. + */ + be64_add_cpu(&dqp->q_core.d_icount, 1); + dqp->q_res_icount++; + if (nblks) { + be64_add_cpu(&dqp->q_core.d_bcount, nblks); + dqp->q_res_bcount += nblks; + } + if (rtblks) { + be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks); + dqp->q_res_rtbcount += rtblks; + } + + /* + * Set default limits, adjust timers (since we changed usages) + * + * There are no timers for the default values set in the root dquot. + */ + if (dqp->q_core.d_id) { + xfs_qm_adjust_dqlimits(mp, dqp); + xfs_qm_adjust_dqtimers(mp, &dqp->q_core); + } + + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_qm_dqput(dqp); + return 0; +} + +STATIC int +xfs_qm_get_rtblks( + xfs_inode_t *ip, + xfs_qcnt_t *O_rtblks) +{ + xfs_filblks_t rtblks; /* total rt blks */ + xfs_extnum_t idx; /* extent record index */ + xfs_ifork_t *ifp; /* inode fork pointer */ + xfs_extnum_t nextents; /* number of extent entries */ + int error; + + ASSERT(XFS_IS_REALTIME_INODE(ip)); + ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) + return error; + } + rtblks = 0; + nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + for (idx = 0; idx < nextents; idx++) + rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); + *O_rtblks = (xfs_qcnt_t)rtblks; + return 0; +} + +/* + * callback routine supplied to bulkstat(). Given an inumber, find its + * dquots and update them to account for resources taken by that inode. + */ +/* ARGSUSED */ +STATIC int +xfs_qm_dqusage_adjust( + xfs_mount_t *mp, /* mount point for filesystem */ + xfs_ino_t ino, /* inode number to get data for */ + void __user *buffer, /* not used */ + int ubsize, /* not used */ + int *ubused, /* not used */ + int *res) /* result code value */ +{ + xfs_inode_t *ip; + xfs_qcnt_t nblks, rtblks = 0; + int error; + + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + /* + * rootino must have its resources accounted for, not so with the quota + * inodes. + */ + if (xfs_is_quota_inode(&mp->m_sb, ino)) { + *res = BULKSTAT_RV_NOTHING; + return XFS_ERROR(EINVAL); + } + + /* + * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget + * interface expects the inode to be exclusively locked because that's + * the case in all other instances. It's OK that we do this because + * quotacheck is done only at mount time. + */ + error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + if (error) { + *res = BULKSTAT_RV_NOTHING; + return error; + } + + ASSERT(ip->i_delayed_blks == 0); + + if (XFS_IS_REALTIME_INODE(ip)) { + /* + * Walk thru the extent list and count the realtime blocks. + */ + error = xfs_qm_get_rtblks(ip, &rtblks); + if (error) + goto error0; + } + + nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; + + /* + * Add the (disk blocks and inode) resources occupied by this + * inode to its dquots. We do this adjustment in the incore dquot, + * and also copy the changes to its buffer. + * We don't care about putting these changes in a transaction + * envelope because if we crash in the middle of a 'quotacheck' + * we have to start from the beginning anyway. + * Once we're done, we'll log all the dquot bufs. + * + * The *QUOTA_ON checks below may look pretty racy, but quotachecks + * and quotaoffs don't race. (Quotachecks happen at mount time only). + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, + XFS_DQ_USER, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_GQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, + XFS_DQ_GROUP, nblks, rtblks); + if (error) + goto error0; + } + + if (XFS_IS_PQUOTA_ON(mp)) { + error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), + XFS_DQ_PROJ, nblks, rtblks); + if (error) + goto error0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_DIDONE; + return 0; + +error0: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + IRELE(ip); + *res = BULKSTAT_RV_GIVEUP; + return error; +} + +STATIC int +xfs_qm_flush_one( + struct xfs_dquot *dqp, + void *data) +{ + struct list_head *buffer_list = data; + struct xfs_buf *bp = NULL; + int error = 0; + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) + goto out_unlock; + if (!XFS_DQ_IS_DIRTY(dqp)) + goto out_unlock; + + xfs_dqflock(dqp); + error = xfs_qm_dqflush(dqp, &bp); + if (error) + goto out_unlock; + + xfs_buf_delwri_queue(bp, buffer_list); + xfs_buf_relse(bp); +out_unlock: + xfs_dqunlock(dqp); + return error; +} + +/* + * Walk thru all the filesystem inodes and construct a consistent view + * of the disk quota world. If the quotacheck fails, disable quotas. + */ +int +xfs_qm_quotacheck( + xfs_mount_t *mp) +{ + int done, count, error, error2; + xfs_ino_t lastino; + size_t structsz; + uint flags; + LIST_HEAD (buffer_list); + struct xfs_inode *uip = mp->m_quotainfo->qi_uquotaip; + struct xfs_inode *gip = mp->m_quotainfo->qi_gquotaip; + struct xfs_inode *pip = mp->m_quotainfo->qi_pquotaip; + + count = INT_MAX; + structsz = 1; + lastino = 0; + flags = 0; + + ASSERT(uip || gip || pip); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + xfs_notice(mp, "Quotacheck needed: Please wait."); + + /* + * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset + * their counters to zero. We need a clean slate. + * We don't log our changes till later. + */ + if (uip) { + error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_UQUOTA_CHKD; + } + + if (gip) { + error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_GQUOTA_CHKD; + } + + if (pip) { + error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, + &buffer_list); + if (error) + goto error_return; + flags |= XFS_PQUOTA_CHKD; + } + + do { + /* + * Iterate thru all the inodes in the file system, + * adjusting the corresponding dquot counters in core. + */ + error = xfs_bulkstat(mp, &lastino, &count, + xfs_qm_dqusage_adjust, + structsz, NULL, &done); + if (error) + break; + + } while (!done); + + /* + * We've made all the changes that we need to make incore. Flush them + * down to disk buffers if everything was updated successfully. + */ + if (XFS_IS_UQUOTA_ON(mp)) { + error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, + &buffer_list); + } + if (XFS_IS_GQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + if (XFS_IS_PQUOTA_ON(mp)) { + error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one, + &buffer_list); + if (!error) + error = error2; + } + + error2 = xfs_buf_delwri_submit(&buffer_list); + if (!error) + error = error2; + + /* + * We can get this error if we couldn't do a dquot allocation inside + * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the + * dirty dquots that might be cached, we just want to get rid of them + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). + */ + if (error) { + xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + goto error_return; + } + + /* + * If one type of quotas is off, then it will lose its + * quotachecked status, since we won't be doing accounting for + * that type anymore. + */ + mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; + mp->m_qflags |= flags; + + error_return: + while (!list_empty(&buffer_list)) { + struct xfs_buf *bp = + list_first_entry(&buffer_list, struct xfs_buf, b_list); + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } + + if (error) { + xfs_warn(mp, + "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", + error); + /* + * We must turn off quotas. + */ + ASSERT(mp->m_quotainfo != NULL); + xfs_qm_destroy_quotainfo(mp); + if (xfs_mount_reset_sbqflags(mp)) { + xfs_warn(mp, + "Quotacheck: Failed to reset quota flags."); + } + } else + xfs_notice(mp, "Quotacheck: Done."); + return (error); +} + +/* + * This is called after the superblock has been read in and we're ready to + * iget the quota inodes. + */ +STATIC int +xfs_qm_init_quotainos( + xfs_mount_t *mp) +{ + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + int error; + __int64_t sbflags = 0; + uint flags = 0; + + ASSERT(mp->m_quotainfo); + + /* + * Get the uquota and gquota inodes + */ + if (xfs_sb_version_hasquota(&mp->m_sb)) { + if (XFS_IS_UQUOTA_ON(mp) && + mp->m_sb.sb_uquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_uquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip); + if (error) + return XFS_ERROR(error); + } + if (XFS_IS_GQUOTA_ON(mp) && + mp->m_sb.sb_gquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_gquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip); + if (error) + goto error_rele; + } + if (XFS_IS_PQUOTA_ON(mp) && + mp->m_sb.sb_pquotino != NULLFSINO) { + ASSERT(mp->m_sb.sb_pquotino > 0); + error = xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip); + if (error) + goto error_rele; + } + } else { + flags |= XFS_QMOPT_SBVERSION; + sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | + XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | + XFS_SB_QFLAGS); + } + + /* + * Create the three inodes, if they don't exist already. The changes + * made above will get added to a transaction and logged in one of + * the qino_alloc calls below. If the device is readonly, + * temporarily switch to read-write to do this. + */ + if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { + error = xfs_qm_qino_alloc(mp, &uip, + sbflags | XFS_SB_UQUOTINO, + flags | XFS_QMOPT_UQUOTA); + if (error) + goto error_rele; + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { + error = xfs_qm_qino_alloc(mp, &gip, + sbflags | XFS_SB_GQUOTINO, + flags | XFS_QMOPT_GQUOTA); + if (error) + goto error_rele; + + flags &= ~XFS_QMOPT_SBVERSION; + } + if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { + error = xfs_qm_qino_alloc(mp, &pip, + sbflags | XFS_SB_PQUOTINO, + flags | XFS_QMOPT_PQUOTA); + if (error) + goto error_rele; + } + + mp->m_quotainfo->qi_uquotaip = uip; + mp->m_quotainfo->qi_gquotaip = gip; + mp->m_quotainfo->qi_pquotaip = pip; + + return 0; + +error_rele: + if (uip) + IRELE(uip); + if (gip) + IRELE(gip); + if (pip) + IRELE(pip); + return XFS_ERROR(error); +} + +STATIC void +xfs_qm_dqfree_one( + struct xfs_dquot *dqp) +{ + struct xfs_mount *mp = dqp->q_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + + mutex_lock(&qi->qi_tree_lock); + radix_tree_delete(xfs_dquot_tree(qi, dqp->q_core.d_flags), + be32_to_cpu(dqp->q_core.d_id)); + + qi->qi_dquots--; + mutex_unlock(&qi->qi_tree_lock); + + xfs_qm_dqdestroy(dqp); +} + +/* + * Start a transaction and write the incore superblock changes to + * disk. flags parameter indicates which fields have changed. + */ +int +xfs_qm_write_sb_changes( + xfs_mount_t *mp, + __int64_t flags) +{ + xfs_trans_t *tp; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_mod_sb(tp, flags); + error = xfs_trans_commit(tp, 0); + + return error; +} + + +/* --------------- utility functions for vnodeops ---------------- */ + + +/* + * Given an inode, a uid, gid and prid make sure that we have + * allocated relevant dquot(s) on disk, and that we won't exceed inode + * quotas by creating this file. + * This also attaches dquot(s) to the given inode after locking it, + * and returns the dquots corresponding to the uid and/or gid. + * + * in : inode (unlocked) + * out : udquot, gdquot with references taken and unlocked + */ +int +xfs_qm_vop_dqalloc( + struct xfs_inode *ip, + xfs_dqid_t uid, + xfs_dqid_t gid, + prid_t prid, + uint flags, + struct xfs_dquot **O_udqpp, + struct xfs_dquot **O_gdqpp, + struct xfs_dquot **O_pdqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_dquot *uq = NULL; + struct xfs_dquot *gq = NULL; + struct xfs_dquot *pq = NULL; + int error; + uint lockflags; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + lockflags = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockflags); + + if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip)) + gid = ip->i_d.di_gid; + + /* + * Attach the dquot(s) to this inode, doing a dquot allocation + * if necessary. The dquot(s) will not be locked. + */ + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + if (error) { + xfs_iunlock(ip, lockflags); + return error; + } + } + + if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + if (ip->i_d.di_uid != uid) { + /* + * What we need is the dquot that has this uid, and + * if we send the inode to dqget, the uid of the inode + * takes priority over what's sent in the uid argument. + * We must unlock inode here before calling dqget if + * we're not sending the inode, because otherwise + * we'll deadlock by doing trans_reserve while + * holding ilock. + */ + xfs_iunlock(ip, lockflags); + error = xfs_qm_dqget(mp, NULL, uid, + XFS_DQ_USER, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &uq); + if (error) { + ASSERT(error != ENOENT); + return error; + } + /* + * Get the ilock in the right order. + */ + xfs_dqunlock(uq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + /* + * Take an extra reference, because we'll return + * this to caller + */ + ASSERT(ip->i_udquot); + uq = xfs_qm_dqhold(ip->i_udquot); + } + } + if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + if (ip->i_d.di_gid != gid) { + xfs_iunlock(ip, lockflags); + error = xfs_qm_dqget(mp, NULL, gid, + XFS_DQ_GROUP, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &gq); + if (error) { + ASSERT(error != ENOENT); + goto error_rele; + } + xfs_dqunlock(gq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_gdquot); + gq = xfs_qm_dqhold(ip->i_gdquot); + } + } + if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + if (xfs_get_projid(ip) != prid) { + xfs_iunlock(ip, lockflags); + error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, + XFS_DQ_PROJ, + XFS_QMOPT_DQALLOC | + XFS_QMOPT_DOWARN, + &pq); + if (error) { + ASSERT(error != ENOENT); + goto error_rele; + } + xfs_dqunlock(pq); + lockflags = XFS_ILOCK_SHARED; + xfs_ilock(ip, lockflags); + } else { + ASSERT(ip->i_pdquot); + pq = xfs_qm_dqhold(ip->i_pdquot); + } + } + if (uq) + trace_xfs_dquot_dqalloc(ip); + + xfs_iunlock(ip, lockflags); + if (O_udqpp) + *O_udqpp = uq; + else if (uq) + xfs_qm_dqrele(uq); + if (O_gdqpp) + *O_gdqpp = gq; + else if (gq) + xfs_qm_dqrele(gq); + if (O_pdqpp) + *O_pdqpp = pq; + else if (pq) + xfs_qm_dqrele(pq); + return 0; + +error_rele: + if (gq) + xfs_qm_dqrele(gq); + if (uq) + xfs_qm_dqrele(uq); + return error; +} + +/* + * Actually transfer ownership, and do dquot modifications. + * These were already reserved. + */ +xfs_dquot_t * +xfs_qm_vop_chown( + xfs_trans_t *tp, + xfs_inode_t *ip, + xfs_dquot_t **IO_olddq, + xfs_dquot_t *newdq) +{ + xfs_dquot_t *prevdq; + uint bfield = XFS_IS_REALTIME_INODE(ip) ? + XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + + /* old dquot */ + prevdq = *IO_olddq; + ASSERT(prevdq); + ASSERT(prevdq != newdq); + + xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks)); + xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1); + + /* the sparkling new dquot */ + xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks); + xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1); + + /* + * Take an extra reference, because the inode is going to keep + * this dquot pointer even after the trans_commit. + */ + *IO_olddq = xfs_qm_dqhold(newdq); + + return prevdq; +} + +/* + * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID). + */ +int +xfs_qm_vop_chown_reserve( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + uint flags) +{ + struct xfs_mount *mp = ip->i_mount; + uint delblks, blkflags, prjflags = 0; + struct xfs_dquot *udq_unres = NULL; + struct xfs_dquot *gdq_unres = NULL; + struct xfs_dquot *pdq_unres = NULL; + struct xfs_dquot *udq_delblks = NULL; + struct xfs_dquot *gdq_delblks = NULL; + struct xfs_dquot *pdq_delblks = NULL; + int error; + + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + delblks = ip->i_delayed_blks; + blkflags = XFS_IS_REALTIME_INODE(ip) ? + XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS; + + if (XFS_IS_UQUOTA_ON(mp) && udqp && + ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) { + udq_delblks = udqp; + /* + * If there are delayed allocation blocks, then we have to + * unreserve those from the old dquot, and add them to the + * new dquot. + */ + if (delblks) { + ASSERT(ip->i_udquot); + udq_unres = ip->i_udquot; + } + } + if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) { + gdq_delblks = gdqp; + if (delblks) { + ASSERT(ip->i_gdquot); + gdq_unres = ip->i_gdquot; + } + } + + if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp && + xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) { + prjflags = XFS_QMOPT_ENOSPC; + pdq_delblks = pdqp; + if (delblks) { + ASSERT(ip->i_pdquot); + pdq_unres = ip->i_pdquot; + } + } + + error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, + udq_delblks, gdq_delblks, pdq_delblks, + ip->i_d.di_nblocks, 1, + flags | blkflags | prjflags); + if (error) + return error; + + /* + * Do the delayed blks reservations/unreservations now. Since, these + * are done without the help of a transaction, if a reservation fails + * its previous reservations won't be automatically undone by trans + * code. So, we have to do it manually here. + */ + if (delblks) { + /* + * Do the reservations first. Unreservation can't fail. + */ + ASSERT(udq_delblks || gdq_delblks || pdq_delblks); + ASSERT(udq_unres || gdq_unres || pdq_unres); + error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_delblks, gdq_delblks, pdq_delblks, + (xfs_qcnt_t)delblks, 0, + flags | blkflags | prjflags); + if (error) + return error; + xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, + udq_unres, gdq_unres, pdq_unres, + -((xfs_qcnt_t)delblks), 0, blkflags); + } + + return (0); +} + +int +xfs_qm_vop_rename_dqattach( + struct xfs_inode **i_tab) +{ + struct xfs_mount *mp = i_tab[0]->i_mount; + int i; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return 0; + + for (i = 0; (i < 4 && i_tab[i]); i++) { + struct xfs_inode *ip = i_tab[i]; + int error; + + /* + * Watch out for duplicate entries in the table. + */ + if (i == 0 || ip != i_tab[i-1]) { + if (XFS_NOT_DQATTACHED(mp, ip)) { + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + } + } + } + return 0; +} + +void +xfs_qm_vop_create_dqattach( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + return; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + + if (udqp && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id)); + + ip->i_udquot = xfs_qm_dqhold(udqp); + xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (gdqp && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id)); + ip->i_gdquot = xfs_qm_dqhold(gdqp); + xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1); + } + if (pdqp && XFS_IS_PQUOTA_ON(mp)) { + ASSERT(ip->i_pdquot == NULL); + ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id)); + + ip->i_pdquot = xfs_qm_dqhold(pdqp); + xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1); + } +} + diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h new file mode 100644 index 00000000000..797fd463627 --- /dev/null +++ b/fs/xfs/xfs_qm.h @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QM_H__ +#define __XFS_QM_H__ + +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" + +struct xfs_inode; + +extern struct kmem_zone *xfs_qm_dqtrxzone; + +/* + * Number of bmaps that we ask from bmapi when doing a quotacheck. + * We make this restriction to keep the memory usage to a minimum. + */ +#define XFS_DQITER_MAP_SIZE 10 + +#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \ + !dqp->q_core.d_blk_hardlimit && \ + !dqp->q_core.d_blk_softlimit && \ + !dqp->q_core.d_rtb_hardlimit && \ + !dqp->q_core.d_rtb_softlimit && \ + !dqp->q_core.d_ino_hardlimit && \ + !dqp->q_core.d_ino_softlimit && \ + !dqp->q_core.d_bcount && \ + !dqp->q_core.d_rtbcount && \ + !dqp->q_core.d_icount) + +/* + * This defines the unit of allocation of dquots. + * Currently, it is just one file system block, and a 4K blk contains 30 + * (136 * 30 = 4080) dquots. It's probably not worth trying to make + * this more dynamic. + * XXXsup However, if this number is changed, we have to make sure that we don't + * implicitly assume that we do allocations in chunks of a single filesystem + * block in the dquot/xqm code. + */ +#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 + +/* + * Various quota information for individual filesystems. + * The mount structure keeps a pointer to this. + */ +typedef struct xfs_quotainfo { + struct radix_tree_root qi_uquota_tree; + struct radix_tree_root qi_gquota_tree; + struct radix_tree_root qi_pquota_tree; + struct mutex qi_tree_lock; + struct xfs_inode *qi_uquotaip; /* user quota inode */ + struct xfs_inode *qi_gquotaip; /* group quota inode */ + struct xfs_inode *qi_pquotaip; /* project quota inode */ + struct list_lru qi_lru; + int qi_dquots; + time_t qi_btimelimit; /* limit for blks timer */ + time_t qi_itimelimit; /* limit for inodes timer */ + time_t qi_rtbtimelimit;/* limit for rt blks timer */ + xfs_qwarncnt_t qi_bwarnlimit; /* limit for blks warnings */ + xfs_qwarncnt_t qi_iwarnlimit; /* limit for inodes warnings */ + xfs_qwarncnt_t qi_rtbwarnlimit;/* limit for rt blks warnings */ + struct mutex qi_quotaofflock;/* to serialize quotaoff */ + xfs_filblks_t qi_dqchunklen; /* # BBs in a chunk of dqs */ + uint qi_dqperchunk; /* # ondisk dqs in above chunk */ + xfs_qcnt_t qi_bhardlimit; /* default data blk hard limit */ + xfs_qcnt_t qi_bsoftlimit; /* default data blk soft limit */ + xfs_qcnt_t qi_ihardlimit; /* default inode count hard limit */ + xfs_qcnt_t qi_isoftlimit; /* default inode count soft limit */ + xfs_qcnt_t qi_rtbhardlimit;/* default realtime blk hard limit */ + xfs_qcnt_t qi_rtbsoftlimit;/* default realtime blk soft limit */ + struct shrinker qi_shrinker; +} xfs_quotainfo_t; + +static inline struct radix_tree_root * +xfs_dquot_tree( + struct xfs_quotainfo *qi, + int type) +{ + switch (type) { + case XFS_DQ_USER: + return &qi->qi_uquota_tree; + case XFS_DQ_GROUP: + return &qi->qi_gquota_tree; + case XFS_DQ_PROJ: + return &qi->qi_pquota_tree; + default: + ASSERT(0); + } + return NULL; +} + +static inline struct xfs_inode * +xfs_dq_to_quota_inode(struct xfs_dquot *dqp) +{ + switch (dqp->dq_flags & XFS_DQ_ALLTYPES) { + case XFS_DQ_USER: + return dqp->q_mount->m_quotainfo->qi_uquotaip; + case XFS_DQ_GROUP: + return dqp->q_mount->m_quotainfo->qi_gquotaip; + case XFS_DQ_PROJ: + return dqp->q_mount->m_quotainfo->qi_pquotaip; + default: + ASSERT(0); + } + return NULL; +} + +extern void xfs_trans_mod_dquot(struct xfs_trans *, + struct xfs_dquot *, uint, long); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, + long, long, uint); +extern void xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *); +extern void xfs_trans_log_dquot(struct xfs_trans *, struct xfs_dquot *); + +/* + * We keep the usr, grp, and prj dquots separately so that locking will be + * easier to do at commit time. All transactions that we know of at this point + * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value. + */ +enum { + XFS_QM_TRANS_USR = 0, + XFS_QM_TRANS_GRP, + XFS_QM_TRANS_PRJ, + XFS_QM_TRANS_DQTYPES +}; +#define XFS_QM_TRANS_MAXDQS 2 +struct xfs_dquot_acct { + struct xfs_dqtrx dqs[XFS_QM_TRANS_DQTYPES][XFS_QM_TRANS_MAXDQS]; +}; + +/* + * Users are allowed to have a usage exceeding their softlimit for + * a period this long. + */ +#define XFS_QM_BTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_RTBTIMELIMIT (7 * 24*60*60) /* 1 week */ +#define XFS_QM_ITIMELIMIT (7 * 24*60*60) /* 1 week */ + +#define XFS_QM_BWARNLIMIT 5 +#define XFS_QM_IWARNLIMIT 5 +#define XFS_QM_RTBWARNLIMIT 5 + +extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); +extern int xfs_qm_quotacheck(struct xfs_mount *); +extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t); + +/* dquot stuff */ +extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); +extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); + +/* quota ops */ +extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct fs_disk_quota *); +extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, + struct fs_disk_quota *); +extern int xfs_qm_scall_getqstat(struct xfs_mount *, + struct fs_quota_stat *); +extern int xfs_qm_scall_getqstatv(struct xfs_mount *, + struct fs_quota_statv *); +extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); +extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); + +#endif /* __XFS_QM_H__ */ diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c new file mode 100644 index 00000000000..e9be63abd8d --- /dev/null +++ b/fs/xfs/xfs_qm_bhv.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_quota.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_qm.h" + + +STATIC void +xfs_fill_statvfs_from_dquot( + struct kstatfs *statp, + struct xfs_dquot *dqp) +{ + __uint64_t limit; + + limit = dqp->q_core.d_blk_softlimit ? + be64_to_cpu(dqp->q_core.d_blk_softlimit) : + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + if (limit && statp->f_blocks > limit) { + statp->f_blocks = limit; + statp->f_bfree = statp->f_bavail = + (statp->f_blocks > dqp->q_res_bcount) ? + (statp->f_blocks - dqp->q_res_bcount) : 0; + } + + limit = dqp->q_core.d_ino_softlimit ? + be64_to_cpu(dqp->q_core.d_ino_softlimit) : + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + if (limit && statp->f_files > limit) { + statp->f_files = limit; + statp->f_ffree = + (statp->f_files > dqp->q_res_icount) ? + (statp->f_ffree - dqp->q_res_icount) : 0; + } +} + + +/* + * Directory tree accounting is implemented using project quotas, where + * the project identifier is inherited from parent directories. + * A statvfs (df, etc.) of a directory that is using project quota should + * return a statvfs of the project, not the entire filesystem. + * This makes such trees appear as if they are filesystems in themselves. + */ +void +xfs_qm_statvfs( + xfs_inode_t *ip, + struct kstatfs *statp) +{ + xfs_mount_t *mp = ip->i_mount; + xfs_dquot_t *dqp; + + if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + xfs_fill_statvfs_from_dquot(statp, dqp); + xfs_qm_dqput(dqp); + } +} + +int +xfs_qm_newmount( + xfs_mount_t *mp, + uint *needquotamount, + uint *quotaflags) +{ + uint quotaondisk; + uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; + + quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); + + if (quotaondisk) { + uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT; + pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT; + gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT; + } + + /* + * If the device itself is read-only, we can't allow + * the user to change the state of quota on the mount - + * this would generate a transaction on the ro device, + * which would lead to an I/O error and shutdown + */ + + if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) || + (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) || + (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || + (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) || + (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) || + (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) && + xfs_dev_is_read_only(mp, "changing quota state")) { + xfs_warn(mp, "please mount with%s%s%s%s.", + (!quotaondisk ? "out quota" : ""), + (uquotaondisk ? " usrquota" : ""), + (gquotaondisk ? " grpquota" : ""), + (pquotaondisk ? " prjquota" : "")); + return XFS_ERROR(EPERM); + } + + if (XFS_IS_QUOTA_ON(mp) || quotaondisk) { + /* + * Call mount_quotas at this point only if we won't have to do + * a quotacheck. + */ + if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { + /* + * If an error occurred, qm_mount_quotas code + * has already disabled quotas. So, just finish + * mounting, and get on with the boring life + * without disk quotas. + */ + xfs_qm_mount_quotas(mp); + } else { + /* + * Clear the quota flags, but remember them. This + * is so that the quota code doesn't get invoked + * before we're ready. This can happen when an + * inode goes inactive and wants to free blocks, + * or via xfs_log_mount_finish. + */ + *needquotamount = true; + *quotaflags = mp->m_qflags; + mp->m_qflags = 0; + } + } + + return 0; +} diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c new file mode 100644 index 00000000000..bbc813caba4 --- /dev/null +++ b/fs/xfs/xfs_qm_syscalls.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/capability.h> + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_trace.h" +#include "xfs_icache.h" + +STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); +STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, + uint); +STATIC uint xfs_qm_export_flags(uint); +STATIC uint xfs_qm_export_qtype_flags(uint); + +/* + * Turn off quota accounting and/or enforcement for all udquots and/or + * gdquots. Called only at unmount time. + * + * This assumes that there are no dquots of this file system cached + * incore, and modifies the ondisk dquot directly. Therefore, for example, + * it is an error to call this twice, without purging the cache. + */ +int +xfs_qm_scall_quotaoff( + xfs_mount_t *mp, + uint flags) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + uint dqtype; + int error; + uint inactivate_flags; + xfs_qoff_logitem_t *qoffstart; + + /* + * No file system can have quotas enabled on disk but not in core. + * Note that quota utilities (like quotaoff) _expect_ + * errno == EEXIST here. + */ + if ((mp->m_qflags & flags) == 0) + return XFS_ERROR(EEXIST); + error = 0; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + + /* + * We don't want to deal with two quotaoffs messing up each other, + * so we're going to serialize it. quotaoff isn't exactly a performance + * critical thing. + * If quotaoff, then we must be dealing with the root filesystem. + */ + ASSERT(q); + mutex_lock(&q->qi_quotaofflock); + + /* + * If we're just turning off quota enforcement, change mp and go. + */ + if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { + mp->m_qflags &= ~(flags); + + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&q->qi_quotaofflock); + + /* XXX what to do if error ? Revert back to old vals incore ? */ + error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); + return (error); + } + + dqtype = 0; + inactivate_flags = 0; + /* + * If accounting is off, we must turn enforcement off, clear the + * quota 'CHKD' certificate to make it known that we have to + * do a quotacheck the next time this quota is turned on. + */ + if (flags & XFS_UQUOTA_ACCT) { + dqtype |= XFS_QMOPT_UQUOTA; + flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); + inactivate_flags |= XFS_UQUOTA_ACTIVE; + } + if (flags & XFS_GQUOTA_ACCT) { + dqtype |= XFS_QMOPT_GQUOTA; + flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); + inactivate_flags |= XFS_GQUOTA_ACTIVE; + } + if (flags & XFS_PQUOTA_ACCT) { + dqtype |= XFS_QMOPT_PQUOTA; + flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); + inactivate_flags |= XFS_PQUOTA_ACTIVE; + } + + /* + * Nothing to do? Don't complain. This happens when we're just + * turning off quota enforcement. + */ + if ((mp->m_qflags & flags) == 0) + goto out_unlock; + + /* + * Write the LI_QUOTAOFF log record, and do SB changes atomically, + * and synchronously. If we fail to write, we should abort the + * operation as it cannot be recovered safely if we crash. + */ + error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); + if (error) + goto out_unlock; + + /* + * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct + * to take care of the race between dqget and quotaoff. We don't take + * any special locks to reset these bits. All processes need to check + * these bits *after* taking inode lock(s) to see if the particular + * quota type is in the process of being turned off. If *ACTIVE, it is + * guaranteed that all dquot structures and all quotainode ptrs will all + * stay valid as long as that inode is kept locked. + * + * There is no turning back after this. + */ + mp->m_qflags &= ~inactivate_flags; + + /* + * Give back all the dquot reference(s) held by inodes. + * Here we go thru every single incore inode in this file system, and + * do a dqrele on the i_udquot/i_gdquot that it may have. + * Essentially, as long as somebody has an inode locked, this guarantees + * that quotas will not be turned off. This is handy because in a + * transaction once we lock the inode(s) and check for quotaon, we can + * depend on the quota inodes (and other things) being valid as long as + * we keep the lock(s). + */ + xfs_qm_dqrele_all_inodes(mp, flags); + + /* + * Next we make the changes in the quota flag in the mount struct. + * This isn't protected by a particular lock directly, because we + * don't want to take a mrlock every time we depend on quotas being on. + */ + mp->m_qflags &= ~flags; + + /* + * Go through all the dquots of this file system and purge them, + * according to what was turned off. + */ + xfs_qm_dqpurge_all(mp, dqtype); + + /* + * Transactions that had started before ACTIVE state bit was cleared + * could have logged many dquots, so they'd have higher LSNs than + * the first QUOTAOFF log record does. If we happen to crash when + * the tail of the log has gone past the QUOTAOFF record, but + * before the last dquot modification, those dquots __will__ + * recover, and that's not good. + * + * So, we have QUOTAOFF start and end logitems; the start + * logitem won't get overwritten until the end logitem appears... + */ + error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags); + if (error) { + /* We're screwed now. Shutdown is the only option. */ + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto out_unlock; + } + + /* + * If all quotas are completely turned off, close shop. + */ + if (mp->m_qflags == 0) { + mutex_unlock(&q->qi_quotaofflock); + xfs_qm_destroy_quotainfo(mp); + return (0); + } + + /* + * Release our quotainode references if we don't need them anymore. + */ + if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { + IRELE(q->qi_uquotaip); + q->qi_uquotaip = NULL; + } + if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { + IRELE(q->qi_gquotaip); + q->qi_gquotaip = NULL; + } + if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { + IRELE(q->qi_pquotaip); + q->qi_pquotaip = NULL; + } + +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_scall_trunc_qfile( + struct xfs_mount *mp, + xfs_ino_t ino) +{ + struct xfs_inode *ip; + struct xfs_trans *tp; + int error; + + if (ino == NULLFSINO) + return 0; + + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); + if (error) + return error; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + goto out_put; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); + if (error) { + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | + XFS_TRANS_ABORT); + goto out_unlock; + } + + ASSERT(ip->i_d.di_nextents == 0); + + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); +out_put: + IRELE(ip); + return error; +} + +int +xfs_qm_scall_trunc_qfiles( + xfs_mount_t *mp, + uint flags) +{ + int error = EINVAL; + + if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + (flags & ~XFS_DQ_ALLTYPES)) { + xfs_debug(mp, "%s: flags=%x m_qflags=%x", + __func__, flags, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + if (flags & XFS_DQ_USER) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino); + if (error) + return error; + } + if (flags & XFS_DQ_GROUP) { + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino); + if (error) + return error; + } + if (flags & XFS_DQ_PROJ) + error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino); + + return error; +} + +/* + * Switch on (a given) quota enforcement for a filesystem. This takes + * effect immediately. + * (Switching on quota accounting must be done at mount time.) + */ +int +xfs_qm_scall_quotaon( + xfs_mount_t *mp, + uint flags) +{ + int error; + uint qf; + __int64_t sbflags; + + flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); + /* + * Switching on quota accounting must be done at mount time. + */ + flags &= ~(XFS_ALL_QUOTA_ACCT); + + sbflags = 0; + + if (flags == 0) { + xfs_debug(mp, "%s: zero flags, m_qflags=%x", + __func__, mp->m_qflags); + return XFS_ERROR(EINVAL); + } + + /* No fs can turn on quotas with a delayed effect */ + ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0); + + /* + * Can't enforce without accounting. We check the superblock + * qflags here instead of m_qflags because rootfs can have + * quota acct on ondisk without m_qflags' knowing. + */ + if (((flags & XFS_UQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 && + (flags & XFS_UQUOTA_ENFD)) || + ((flags & XFS_GQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && + (flags & XFS_GQUOTA_ENFD)) || + ((flags & XFS_PQUOTA_ACCT) == 0 && + (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 && + (flags & XFS_PQUOTA_ENFD))) { + xfs_debug(mp, + "%s: Can't enforce without acct, flags=%x sbflags=%x", + __func__, flags, mp->m_sb.sb_qflags); + return XFS_ERROR(EINVAL); + } + /* + * If everything's up to-date incore, then don't waste time. + */ + if ((mp->m_qflags & flags) == flags) + return XFS_ERROR(EEXIST); + + /* + * Change sb_qflags on disk but not incore mp->qflags + * if this is the root filesystem. + */ + spin_lock(&mp->m_sb_lock); + qf = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = qf | flags; + spin_unlock(&mp->m_sb_lock); + + /* + * There's nothing to change if it's the same. + */ + if ((qf & flags) == flags && sbflags == 0) + return XFS_ERROR(EEXIST); + sbflags |= XFS_SB_QFLAGS; + + if ((error = xfs_qm_write_sb_changes(mp, sbflags))) + return (error); + /* + * If we aren't trying to switch on quota enforcement, we are done. + */ + if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) != + (mp->m_qflags & XFS_UQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != + (mp->m_qflags & XFS_PQUOTA_ACCT)) || + ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != + (mp->m_qflags & XFS_GQUOTA_ACCT)) || + (flags & XFS_ALL_QUOTA_ENFD) == 0) + return (0); + + if (! XFS_IS_QUOTA_RUNNING(mp)) + return XFS_ERROR(ESRCH); + + /* + * Switch on quota enforcement in core. + */ + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); + + return (0); +} + + +/* + * Return quota status information, such as uquota-off, enforcements, etc. + * for Q_XGETQSTAT command. + */ +int +xfs_qm_scall_getqstat( + struct xfs_mount *mp, + struct fs_quota_stat *out) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + bool tempuqip = false; + bool tempgqip = false; + bool temppqip = false; + + memset(out, 0, sizeof(fs_quota_stat_t)); + + out->qs_version = FS_QSTAT_VERSION; + if (!xfs_sb_version_hasquota(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + return (0); + } + + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + pip = q->qi_pquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip) == 0) + tempuqip = true; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip) == 0) + tempgqip = true; + } + /* + * Q_XGETQSTAT doesn't have room for both group and project quotas. + * So, allow the project quota values to be copied out only if + * there is no group quota information available. + */ + if (!gip) { + if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip) == 0) + temppqip = true; + } + } else + pip = NULL; + if (uip) { + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + IRELE(uip); + } + + if (gip) { + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + IRELE(gip); + } + if (pip) { + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = pip->i_d.di_nextents; + if (temppqip) + IRELE(pip); + } + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + } + return 0; +} + +/* + * Return quota status information, such as uquota-off, enforcements, etc. + * for Q_XGETQSTATV command, to support separate project quota field. + */ +int +xfs_qm_scall_getqstatv( + struct xfs_mount *mp, + struct fs_quota_statv *out) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_inode *uip = NULL; + struct xfs_inode *gip = NULL; + struct xfs_inode *pip = NULL; + bool tempuqip = false; + bool tempgqip = false; + bool temppqip = false; + + if (!xfs_sb_version_hasquota(&mp->m_sb)) { + out->qs_uquota.qfs_ino = NULLFSINO; + out->qs_gquota.qfs_ino = NULLFSINO; + out->qs_pquota.qfs_ino = NULLFSINO; + return (0); + } + + out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & + (XFS_ALL_QUOTA_ACCT| + XFS_ALL_QUOTA_ENFD)); + out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; + out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; + out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; + + if (q) { + uip = q->qi_uquotaip; + gip = q->qi_gquotaip; + pip = q->qi_pquotaip; + } + if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, + 0, 0, &uip) == 0) + tempuqip = true; + } + if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, + 0, 0, &gip) == 0) + tempgqip = true; + } + if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { + if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, + 0, 0, &pip) == 0) + temppqip = true; + } + if (uip) { + out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; + out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; + if (tempuqip) + IRELE(uip); + } + + if (gip) { + out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; + out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; + if (tempgqip) + IRELE(gip); + } + if (pip) { + out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks; + out->qs_pquota.qfs_nextents = pip->i_d.di_nextents; + if (temppqip) + IRELE(pip); + } + if (q) { + out->qs_incoredqs = q->qi_dquots; + out->qs_btimelimit = q->qi_btimelimit; + out->qs_itimelimit = q->qi_itimelimit; + out->qs_rtbtimelimit = q->qi_rtbtimelimit; + out->qs_bwarnlimit = q->qi_bwarnlimit; + out->qs_iwarnlimit = q->qi_iwarnlimit; + } + return 0; +} + +#define XFS_DQ_MASK \ + (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) + +/* + * Adjust quota limits, and start/stop timers accordingly. + */ +int +xfs_qm_scall_setqlim( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + fs_disk_quota_t *newlim) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + struct xfs_disk_dquot *ddq; + struct xfs_dquot *dqp; + struct xfs_trans *tp; + int error; + xfs_qcnt_t hard, soft; + + if (newlim->d_fieldmask & ~XFS_DQ_MASK) + return EINVAL; + if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) + return 0; + + /* + * We don't want to race with a quotaoff so take the quotaoff lock. + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. + */ + mutex_lock(&q->qi_quotaofflock); + + /* + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. + */ + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { + ASSERT(error != ENOENT); + goto out_unlock; + } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); + xfs_trans_dqjoin(tp, dqp); + ddq = &dqp->q_core; + + /* + * Make sure that hardlimits are >= soft limits before changing. + */ + hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : + be64_to_cpu(ddq->d_blk_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : + be64_to_cpu(ddq->d_blk_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_blk_hardlimit = cpu_to_be64(hard); + ddq->d_blk_softlimit = cpu_to_be64(soft); + xfs_dquot_set_prealloc_limits(dqp); + if (id == 0) { + q->qi_bhardlimit = hard; + q->qi_bsoftlimit = soft; + } + } else { + xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); + } + hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : + be64_to_cpu(ddq->d_rtb_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? + (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : + be64_to_cpu(ddq->d_rtb_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_rtb_hardlimit = cpu_to_be64(hard); + ddq->d_rtb_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_rtbhardlimit = hard; + q->qi_rtbsoftlimit = soft; + } + } else { + xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); + } + + hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? + (xfs_qcnt_t) newlim->d_ino_hardlimit : + be64_to_cpu(ddq->d_ino_hardlimit); + soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? + (xfs_qcnt_t) newlim->d_ino_softlimit : + be64_to_cpu(ddq->d_ino_softlimit); + if (hard == 0 || hard >= soft) { + ddq->d_ino_hardlimit = cpu_to_be64(hard); + ddq->d_ino_softlimit = cpu_to_be64(soft); + if (id == 0) { + q->qi_ihardlimit = hard; + q->qi_isoftlimit = soft; + } + } else { + xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft); + } + + /* + * Update warnings counter(s) if requested + */ + if (newlim->d_fieldmask & FS_DQ_BWARNS) + ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); + if (newlim->d_fieldmask & FS_DQ_IWARNS) + ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); + + if (id == 0) { + /* + * Timelimits for the super user set the relative time + * the other users can be over quota for this file system. + * If it is zero a default is used. Ditto for the default + * soft and hard limit values (already done, above), and + * for warnings. + */ + if (newlim->d_fieldmask & FS_DQ_BTIMER) { + q->qi_btimelimit = newlim->d_btimer; + ddq->d_btimer = cpu_to_be32(newlim->d_btimer); + } + if (newlim->d_fieldmask & FS_DQ_ITIMER) { + q->qi_itimelimit = newlim->d_itimer; + ddq->d_itimer = cpu_to_be32(newlim->d_itimer); + } + if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { + q->qi_rtbtimelimit = newlim->d_rtbtimer; + ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); + } + if (newlim->d_fieldmask & FS_DQ_BWARNS) + q->qi_bwarnlimit = newlim->d_bwarns; + if (newlim->d_fieldmask & FS_DQ_IWARNS) + q->qi_iwarnlimit = newlim->d_iwarns; + if (newlim->d_fieldmask & FS_DQ_RTBWARNS) + q->qi_rtbwarnlimit = newlim->d_rtbwarns; + } else { + /* + * If the user is now over quota, start the timelimit. + * The user will not be 'warned'. + * Note that we keep the timers ticking, whether enforcement + * is on or off. We don't really want to bother with iterating + * over all ondisk dquots and turning the timers on/off. + */ + xfs_qm_adjust_dqtimers(mp, ddq); + } + dqp->dq_flags |= XFS_DQ_DIRTY; + xfs_trans_log_dquot(tp, dqp); + + error = xfs_trans_commit(tp, 0); + +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: + mutex_unlock(&q->qi_quotaofflock); + return error; +} + +STATIC int +xfs_qm_log_quotaoff_end( + xfs_mount_t *mp, + xfs_qoff_logitem_t *startqoff, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); + + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return (error); + } + + qoffi = xfs_trans_get_qoff_item(tp, startqoff, + flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + return (error); +} + + +STATIC int +xfs_qm_log_quotaoff( + xfs_mount_t *mp, + xfs_qoff_logitem_t **qoffstartp, + uint flags) +{ + xfs_trans_t *tp; + int error; + xfs_qoff_logitem_t *qoffi=NULL; + uint oldsbqflag=0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); + if (error) + goto error0; + + qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); + xfs_trans_log_quotaoff_item(tp, qoffi); + + spin_lock(&mp->m_sb_lock); + oldsbqflag = mp->m_sb.sb_qflags; + mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; + spin_unlock(&mp->m_sb_lock); + + xfs_mod_sb(tp, XFS_SB_QFLAGS); + + /* + * We have to make sure that the transaction is secure on disk before we + * return and actually stop quota accounting. So, make it synchronous. + * We don't care about quotoff's performance. + */ + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp, 0); + +error0: + if (error) { + xfs_trans_cancel(tp, 0); + /* + * No one else is modifying sb_qflags, so this is OK. + * We still hold the quotaofflock. + */ + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = oldsbqflag; + spin_unlock(&mp->m_sb_lock); + } + *qoffstartp = qoffi; + return (error); +} + + +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct fs_disk_quota *dst) +{ + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so + * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't + * exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = XFS_ERROR(ENOENT); + goto out_put; + } + + memset(dst, 0, sizeof(*dst)); + dst->d_version = FS_DQUOT_VERSION; + dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); + dst->d_id = be32_to_cpu(dqp->q_core.d_id); + dst->d_blk_hardlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); + dst->d_blk_softlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit)); + dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); + dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); + dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); + dst->d_icount = dqp->q_res_icount; + dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); + dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); + dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); + dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); + dst->d_rtb_hardlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); + dst->d_rtb_softlimit = + XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); + dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); + dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); + dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); + + /* + * Internally, we don't reset all the timers when quota enforcement + * gets turned off. No need to confuse the user level code, + * so return zeroes in that case. + */ + if ((!XFS_IS_UQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_USER) || + (!XFS_IS_GQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_GROUP) || + (!XFS_IS_PQUOTA_ENFORCED(mp) && + dqp->q_core.d_flags == XFS_DQ_PROJ)) { + dst->d_btimer = 0; + dst->d_itimer = 0; + dst->d_rtbtimer = 0; + } + +#ifdef DEBUG + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || + (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || + (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && + dst->d_id != 0) { + if ((dst->d_bcount > dst->d_blk_softlimit) && + (dst->d_blk_softlimit > 0)) { + ASSERT(dst->d_btimer != 0); + } + if ((dst->d_icount > dst->d_ino_softlimit) && + (dst->d_ino_softlimit > 0)) { + ASSERT(dst->d_itimer != 0); + } + } +#endif +out_put: + xfs_qm_dqput(dqp); + return error; +} + +STATIC uint +xfs_qm_export_qtype_flags( + uint flags) +{ + /* + * Can't be more than one, or none. + */ + ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != + (FS_PROJ_QUOTA | FS_USER_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != + (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != + (FS_USER_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); + + return (flags & XFS_DQ_USER) ? + FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? + FS_PROJ_QUOTA : FS_GROUP_QUOTA; +} + +STATIC uint +xfs_qm_export_flags( + uint flags) +{ + uint uflags; + + uflags = 0; + if (flags & XFS_UQUOTA_ACCT) + uflags |= FS_QUOTA_UDQ_ACCT; + if (flags & XFS_GQUOTA_ACCT) + uflags |= FS_QUOTA_GDQ_ACCT; + if (flags & XFS_PQUOTA_ACCT) + uflags |= FS_QUOTA_PDQ_ACCT; + if (flags & XFS_UQUOTA_ENFD) + uflags |= FS_QUOTA_UDQ_ENFD; + if (flags & XFS_GQUOTA_ENFD) + uflags |= FS_QUOTA_GDQ_ENFD; + if (flags & XFS_PQUOTA_ENFD) + uflags |= FS_QUOTA_PDQ_ENFD; + return (uflags); +} + + +STATIC int +xfs_dqrele_inode( + struct xfs_inode *ip, + int flags, + void *args) +{ + /* skip quota inodes */ + if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || + ip == ip->i_mount->m_quotainfo->qi_gquotaip || + ip == ip->i_mount->m_quotainfo->qi_pquotaip) { + ASSERT(ip->i_udquot == NULL); + ASSERT(ip->i_gdquot == NULL); + ASSERT(ip->i_pdquot == NULL); + return 0; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) { + xfs_qm_dqrele(ip->i_udquot); + ip->i_udquot = NULL; + } + if ((flags & XFS_GQUOTA_ACCT) && ip->i_gdquot) { + xfs_qm_dqrele(ip->i_gdquot); + ip->i_gdquot = NULL; + } + if ((flags & XFS_PQUOTA_ACCT) && ip->i_pdquot) { + xfs_qm_dqrele(ip->i_pdquot); + ip->i_pdquot = NULL; + } + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; +} + + +/* + * Go thru all the inodes in the file system, releasing their dquots. + * + * Note that the mount structure gets modified to indicate that quotas are off + * AFTER this, in the case of quotaoff. + */ +void +xfs_qm_dqrele_all_inodes( + struct xfs_mount *mp, + uint flags) +{ + ASSERT(mp->m_quotainfo); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); +} diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c deleted file mode 100644 index 320d63ff9ca..00000000000 --- a/fs/xfs/xfs_qmops.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_quota.h" -#include "xfs_error.h" - -STATIC struct xfs_dquot * -xfs_dqvopchown_default( - struct xfs_trans *tp, - struct xfs_inode *ip, - struct xfs_dquot **dqp, - struct xfs_dquot *dq) -{ - return NULL; -} - -/* - * Clear the quotaflags in memory and in the superblock. - */ -int -xfs_mount_reset_sbqflags(xfs_mount_t *mp) -{ - int error; - xfs_trans_t *tp; - unsigned long s; - - mp->m_qflags = 0; - /* - * It is OK to look at sb_qflags here in mount path, - * without SB_LOCK. - */ - if (mp->m_sb.sb_qflags == 0) - return 0; - s = XFS_SB_LOCK(mp); - mp->m_sb.sb_qflags = 0; - XFS_SB_UNLOCK(mp, s); - - /* - * if the fs is readonly, let the incore superblock run - * with quotas off but don't flush the update out to disk - */ - if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) - return 0; -#ifdef QUOTADEBUG - xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); -#endif - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); - if ((error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, - XFS_DEFAULT_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_mount_reset_sbqflags: Superblock update failed!"); - return error; - } - xfs_mod_sb(tp, XFS_SB_QFLAGS); - error = xfs_trans_commit(tp, 0, NULL); - return error; -} - -STATIC int -xfs_noquota_init( - xfs_mount_t *mp, - uint *needquotamount, - uint *quotaflags) -{ - int error = 0; - - *quotaflags = 0; - *needquotamount = B_FALSE; - - ASSERT(!XFS_IS_QUOTA_ON(mp)); - - /* - * If a file system had quotas running earlier, but decided to - * mount without -o uquota/pquota/gquota options, revoke the - * quotachecked license. - */ - if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { - cmn_err(CE_NOTE, - "XFS resetting qflags for filesystem %s", - mp->m_fsname); - - error = xfs_mount_reset_sbqflags(mp); - } - return error; -} - -xfs_qmops_t xfs_qmcore_stub = { - .xfs_qminit = (xfs_qminit_t) xfs_noquota_init, - .xfs_qmdone = (xfs_qmdone_t) fs_noerr, - .xfs_qmmount = (xfs_qmmount_t) fs_noerr, - .xfs_qmunmount = (xfs_qmunmount_t) fs_noerr, - .xfs_dqrele = (xfs_dqrele_t) fs_noerr, - .xfs_dqattach = (xfs_dqattach_t) fs_noerr, - .xfs_dqdetach = (xfs_dqdetach_t) fs_noerr, - .xfs_dqpurgeall = (xfs_dqpurgeall_t) fs_noerr, - .xfs_dqvopalloc = (xfs_dqvopalloc_t) fs_noerr, - .xfs_dqvopcreate = (xfs_dqvopcreate_t) fs_noerr, - .xfs_dqvoprename = (xfs_dqvoprename_t) fs_noerr, - .xfs_dqvopchown = xfs_dqvopchown_default, - .xfs_dqvopchownresv = (xfs_dqvopchownresv_t) fs_noerr, -}; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index acb853b33eb..5376dd406ba 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -18,234 +18,15 @@ #ifndef __XFS_QUOTA_H__ #define __XFS_QUOTA_H__ -/* - * The ondisk form of a dquot structure. - */ -#define XFS_DQUOT_MAGIC 0x4451 /* 'DQ' */ -#define XFS_DQUOT_VERSION (u_int8_t)0x01 /* latest version number */ - -/* - * uid_t and gid_t are hard-coded to 32 bits in the inode. - * Hence, an 'id' in a dquot is 32 bits.. - */ -typedef __uint32_t xfs_dqid_t; - -/* - * Even though users may not have quota limits occupying all 64-bits, - * they may need 64-bit accounting. Hence, 64-bit quota-counters, - * and quota-limits. This is a waste in the common case, but hey ... - */ -typedef __uint64_t xfs_qcnt_t; -typedef __uint16_t xfs_qwarncnt_t; - -/* - * This is the main portion of the on-disk representation of quota - * information for a user. This is the q_core of the xfs_dquot_t that - * is kept in kernel memory. We pad this with some more expansion room - * to construct the on disk structure. - */ -typedef struct xfs_disk_dquot { - __be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */ - __u8 d_version; /* dquot version */ - __u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */ - __be32 d_id; /* user,project,group id */ - __be64 d_blk_hardlimit;/* absolute limit on disk blks */ - __be64 d_blk_softlimit;/* preferred limit on disk blks */ - __be64 d_ino_hardlimit;/* maximum # allocated inodes */ - __be64 d_ino_softlimit;/* preferred inode limit */ - __be64 d_bcount; /* disk blocks owned by the user */ - __be64 d_icount; /* inodes owned by the user */ - __be32 d_itimer; /* zero if within inode limits if not, - this is when we refuse service */ - __be32 d_btimer; /* similar to above; for disk blocks */ - __be16 d_iwarns; /* warnings issued wrt num inodes */ - __be16 d_bwarns; /* warnings issued wrt disk blocks */ - __be32 d_pad0; /* 64 bit align */ - __be64 d_rtb_hardlimit;/* absolute limit on realtime blks */ - __be64 d_rtb_softlimit;/* preferred limit on RT disk blks */ - __be64 d_rtbcount; /* realtime blocks owned */ - __be32 d_rtbtimer; /* similar to above; for RT disk blocks */ - __be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */ - __be16 d_pad; -} xfs_disk_dquot_t; - -/* - * This is what goes on disk. This is separated from the xfs_disk_dquot because - * carrying the unnecessary padding would be a waste of memory. - */ -typedef struct xfs_dqblk { - xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */ - char dd_fill[32]; /* filling for posterity */ -} xfs_dqblk_t; - -/* - * flags for q_flags field in the dquot. - */ -#define XFS_DQ_USER 0x0001 /* a user quota */ -#define XFS_DQ_PROJ 0x0002 /* project quota */ -#define XFS_DQ_GROUP 0x0004 /* a group quota */ -#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */ -#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */ -#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */ -#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */ -#define XFS_DQ_MARKER 0x0080 /* sentinel */ - -#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) +#include "xfs_quota_defs.h" /* - * In the worst case, when both user and group quotas are on, - * we can have a max of three dquots changing in a single transaction. + * Kernel only quota definitions and functions */ -#define XFS_DQUOT_LOGRES(mp) (sizeof(xfs_disk_dquot_t) * 3) +struct xfs_trans; /* - * These are the structures used to lay out dquots and quotaoff - * records on the log. Quite similar to those of inodes. - */ - -/* - * log format struct for dquots. - * The first two fields must be the type and size fitting into - * 32 bits : log_recovery code assumes that. - */ -typedef struct xfs_dq_logformat { - __uint16_t qlf_type; /* dquot log item type */ - __uint16_t qlf_size; /* size of this item */ - xfs_dqid_t qlf_id; /* usr/grp/proj id : 32 bits */ - __int64_t qlf_blkno; /* blkno of dquot buffer */ - __int32_t qlf_len; /* len of dquot buffer */ - __uint32_t qlf_boffset; /* off of dquot in buffer */ -} xfs_dq_logformat_t; - -/* - * log format struct for QUOTAOFF records. - * The first two fields must be the type and size fitting into - * 32 bits : log_recovery code assumes that. - * We write two LI_QUOTAOFF logitems per quotaoff, the last one keeps a pointer - * to the first and ensures that the first logitem is taken out of the AIL - * only when the last one is securely committed. - */ -typedef struct xfs_qoff_logformat { - unsigned short qf_type; /* quotaoff log item type */ - unsigned short qf_size; /* size of this item */ - unsigned int qf_flags; /* USR and/or GRP */ - char qf_pad[12]; /* padding for future */ -} xfs_qoff_logformat_t; - - -/* - * Disk quotas status in m_qflags, and also sb_qflags. 16 bits. - */ -#define XFS_UQUOTA_ACCT 0x0001 /* user quota accounting ON */ -#define XFS_UQUOTA_ENFD 0x0002 /* user quota limits enforced */ -#define XFS_UQUOTA_CHKD 0x0004 /* quotacheck run on usr quotas */ -#define XFS_PQUOTA_ACCT 0x0008 /* project quota accounting ON */ -#define XFS_OQUOTA_ENFD 0x0010 /* other (grp/prj) quota limits enforced */ -#define XFS_OQUOTA_CHKD 0x0020 /* quotacheck run on other (grp/prj) quotas */ -#define XFS_GQUOTA_ACCT 0x0040 /* group quota accounting ON */ - -/* - * Quota Accounting/Enforcement flags - */ -#define XFS_ALL_QUOTA_ACCT \ - (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT) -#define XFS_ALL_QUOTA_ENFD (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD) -#define XFS_ALL_QUOTA_CHKD (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD) - -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_QUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) - -/* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. - */ -#define XFS_UQUOTA_ACTIVE 0x0100 /* uquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x0200 /* pquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x0400 /* gquotas are being turned off */ - -/* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. - */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) - -/* - * Flags to tell various functions what to do. Not all of these are meaningful - * to a single function. None of these XFS_QMOPT_* flags are meant to have - * persistent values (ie. their values can and will change between versions) - */ -#define XFS_QMOPT_DQLOCK 0x0000001 /* dqlock */ -#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ -#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ -#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ -#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ -#define XFS_QMOPT_DQSUSER 0x0000020 /* don't cache super users dquot */ -#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ -#define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ -#define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ -#define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ -#define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ -#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ -#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ - -/* - * flags to xfs_trans_mod_dquot to indicate which field needs to be - * modified. - */ -#define XFS_QMOPT_RES_REGBLKS 0x0010000 -#define XFS_QMOPT_RES_RTBLKS 0x0020000 -#define XFS_QMOPT_BCOUNT 0x0040000 -#define XFS_QMOPT_ICOUNT 0x0080000 -#define XFS_QMOPT_RTBCOUNT 0x0100000 -#define XFS_QMOPT_DELBCOUNT 0x0200000 -#define XFS_QMOPT_DELRTBCOUNT 0x0400000 -#define XFS_QMOPT_RES_INOS 0x0800000 - -/* - * flags for dqflush and dqflush_all. - */ -#define XFS_QMOPT_SYNC 0x1000000 -#define XFS_QMOPT_ASYNC 0x2000000 -#define XFS_QMOPT_DELWRI 0x4000000 - -/* - * flags for dqalloc. - */ -#define XFS_QMOPT_INHERIT 0x8000000 - -/* - * flags to xfs_trans_mod_dquot. - */ -#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS -#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS -#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS -#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT -#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT -#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT -#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT -#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT - - -#define XFS_QMOPT_QUOTALL \ - (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) -#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) - -#ifdef __KERNEL__ -/* * This check is done typically without holding the inode lock; * that may seem racy, but it is harmless in the context that it is used. * The inode cannot go inactive as long a reference is kept, and @@ -254,36 +35,18 @@ typedef struct xfs_qoff_logformat { * we didn't have the inode locked, the appropriate dquot(s) will be * attached atomically. */ -#define XFS_NOT_DQATTACHED(mp, ip) ((XFS_IS_UQUOTA_ON(mp) &&\ - (ip)->i_udquot == NULL) || \ - (XFS_IS_OQUOTA_ON(mp) && \ - (ip)->i_gdquot == NULL)) +#define XFS_NOT_DQATTACHED(mp, ip) \ + ((XFS_IS_UQUOTA_ON(mp) && (ip)->i_udquot == NULL) || \ + (XFS_IS_GQUOTA_ON(mp) && (ip)->i_gdquot == NULL) || \ + (XFS_IS_PQUOTA_ON(mp) && (ip)->i_pdquot == NULL)) #define XFS_QM_NEED_QUOTACHECK(mp) \ ((XFS_IS_UQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_UQUOTA_CHKD) == 0) || \ (XFS_IS_GQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT))) || \ + (mp->m_sb.sb_qflags & XFS_GQUOTA_CHKD) == 0) || \ (XFS_IS_PQUOTA_ON(mp) && \ - ((mp->m_sb.sb_qflags & XFS_OQUOTA_CHKD) == 0 || \ - (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT)))) - -#define XFS_MOUNT_QUOTA_SET1 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) - -#define XFS_MOUNT_QUOTA_SET2 (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_GQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD) - -#define XFS_MOUNT_QUOTA_ALL (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD|\ - XFS_UQUOTA_CHKD|XFS_PQUOTA_ACCT|\ - XFS_OQUOTA_ENFD|XFS_OQUOTA_CHKD|\ - XFS_GQUOTA_ACCT) -#define XFS_MOUNT_QUOTA_MASK (XFS_MOUNT_QUOTA_ALL | XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) - + (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) /* * The structure kept inside the xfs_trans_t keep track of dquot changes @@ -304,69 +67,89 @@ typedef struct xfs_dqtrx { long qt_delrtb_delta; /* delayed RT blk count changes */ } xfs_dqtrx_t; -/* - * Dquot transaction functions, used if quota is enabled. - */ -typedef void (*qo_dup_dqinfo_t)(struct xfs_trans *, struct xfs_trans *); -typedef void (*qo_mod_dquot_byino_t)(struct xfs_trans *, - struct xfs_inode *, uint, long); -typedef void (*qo_free_dqinfo_t)(struct xfs_trans *); -typedef void (*qo_apply_dquot_deltas_t)(struct xfs_trans *); -typedef void (*qo_unreserve_and_mod_dquots_t)(struct xfs_trans *); -typedef int (*qo_reserve_quota_nblks_t)( - struct xfs_trans *, struct xfs_mount *, - struct xfs_inode *, long, long, uint); -typedef int (*qo_reserve_quota_bydquots_t)( - struct xfs_trans *, struct xfs_mount *, - struct xfs_dquot *, struct xfs_dquot *, - long, long, uint); -typedef struct xfs_dqtrxops { - qo_dup_dqinfo_t qo_dup_dqinfo; - qo_free_dqinfo_t qo_free_dqinfo; - qo_mod_dquot_byino_t qo_mod_dquot_byino; - qo_apply_dquot_deltas_t qo_apply_dquot_deltas; - qo_reserve_quota_nblks_t qo_reserve_quota_nblks; - qo_reserve_quota_bydquots_t qo_reserve_quota_bydquots; - qo_unreserve_and_mod_dquots_t qo_unreserve_and_mod_dquots; -} xfs_dqtrxops_t; - -#define XFS_DQTRXOP(mp, tp, op, args...) \ - ((mp)->m_qm_ops.xfs_dqtrxops ? \ - ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : 0) - -#define XFS_DQTRXOP_VOID(mp, tp, op, args...) \ - ((mp)->m_qm_ops.xfs_dqtrxops ? \ - ((mp)->m_qm_ops.xfs_dqtrxops->op)(tp, ## args) : (void)0) - -#define XFS_TRANS_DUP_DQINFO(mp, otp, ntp) \ - XFS_DQTRXOP_VOID(mp, otp, qo_dup_dqinfo, ntp) -#define XFS_TRANS_FREE_DQINFO(mp, tp) \ - XFS_DQTRXOP_VOID(mp, tp, qo_free_dqinfo) -#define XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, field, delta) \ - XFS_DQTRXOP_VOID(mp, tp, qo_mod_dquot_byino, ip, field, delta) -#define XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp) \ - XFS_DQTRXOP_VOID(mp, tp, qo_apply_dquot_deltas) -#define XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, fl) \ - XFS_DQTRXOP(mp, tp, qo_reserve_quota_nblks, mp, ip, nblks, ninos, fl) -#define XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, fl) \ - XFS_DQTRXOP(mp, tp, qo_reserve_quota_bydquots, mp, ud, gd, nb, ni, fl) -#define XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp) \ - XFS_DQTRXOP_VOID(mp, tp, qo_unreserve_and_mod_dquots) - -#define XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, nblks, ninos, flags) \ - XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, -(nblks), -(ninos), flags) -#define XFS_TRANS_RESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \ - XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, nb, ni, \ - f | XFS_QMOPT_RES_REGBLKS) -#define XFS_TRANS_UNRESERVE_QUOTA(mp, tp, ud, gd, nb, ni, f) \ - XFS_TRANS_RESERVE_QUOTA_BYDQUOTS(mp, tp, ud, gd, -(nb), -(ni), \ +#ifdef CONFIG_XFS_QUOTA +extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *); +extern void xfs_trans_free_dqinfo(struct xfs_trans *); +extern void xfs_trans_mod_dquot_byino(struct xfs_trans *, struct xfs_inode *, + uint, long); +extern void xfs_trans_apply_dquot_deltas(struct xfs_trans *); +extern void xfs_trans_unreserve_and_mod_dquots(struct xfs_trans *); +extern int xfs_trans_reserve_quota_nblks(struct xfs_trans *, + struct xfs_inode *, long, long, uint); +extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *, + struct xfs_mount *, struct xfs_dquot *, + struct xfs_dquot *, struct xfs_dquot *, long, long, uint); + +extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t, + prid_t, uint, struct xfs_dquot **, struct xfs_dquot **, + struct xfs_dquot **); +extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *); +extern int xfs_qm_vop_rename_dqattach(struct xfs_inode **); +extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, + struct xfs_inode *, struct xfs_dquot **, struct xfs_dquot *); +extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, + struct xfs_dquot *, struct xfs_dquot *, + struct xfs_dquot *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *, uint); +extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern void xfs_qm_dqdetach(struct xfs_inode *); +extern void xfs_qm_dqrele(struct xfs_dquot *); +extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); +extern int xfs_qm_newmount(struct xfs_mount *, uint *, uint *); +extern void xfs_qm_mount_quotas(struct xfs_mount *); +extern void xfs_qm_unmount(struct xfs_mount *); +extern void xfs_qm_unmount_quotas(struct xfs_mount *); + +#else +static inline int +xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid, + prid_t prid, uint flags, struct xfs_dquot **udqp, + struct xfs_dquot **gdqp, struct xfs_dquot **pdqp) +{ + *udqp = NULL; + *gdqp = NULL; + *pdqp = NULL; + return 0; +} +#define xfs_trans_dup_dqinfo(tp, tp2) +#define xfs_trans_free_dqinfo(tp) +#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) +#define xfs_trans_apply_dquot_deltas(tp) +#define xfs_trans_unreserve_and_mod_dquots(tp) +static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp, + struct xfs_inode *ip, long nblks, long ninos, uint flags) +{ + return 0; +} +static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, + struct xfs_mount *mp, struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, + long nblks, long nions, uint flags) +{ + return 0; +} +#define xfs_qm_vop_create_dqattach(tp, ip, u, g, p) +#define xfs_qm_vop_rename_dqattach(it) (0) +#define xfs_qm_vop_chown(tp, ip, old, new) (NULL) +#define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) +#define xfs_qm_dqattach(ip, fl) (0) +#define xfs_qm_dqattach_locked(ip, fl) (0) +#define xfs_qm_dqdetach(ip) +#define xfs_qm_dqrele(d) +#define xfs_qm_statvfs(ip, s) +#define xfs_qm_newmount(mp, a, b) (0) +#define xfs_qm_mount_quotas(mp) +#define xfs_qm_unmount(mp) +#define xfs_qm_unmount_quotas(mp) +#endif /* CONFIG_XFS_QUOTA */ + +#define xfs_trans_unreserve_quota_nblks(tp, ip, nblks, ninos, flags) \ + xfs_trans_reserve_quota_nblks(tp, ip, -(nblks), -(ninos), flags) +#define xfs_trans_reserve_quota(tp, mp, ud, gd, pd, nb, ni, f) \ + xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \ f | XFS_QMOPT_RES_REGBLKS) -extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); extern int xfs_mount_reset_sbqflags(struct xfs_mount *); -extern struct bhv_module_vfsops xfs_qmops; - -#endif /* __KERNEL__ */ - #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h new file mode 100644 index 00000000000..137e2093707 --- /dev/null +++ b/fs/xfs/xfs_quota_defs.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_QUOTA_DEFS_H__ +#define __XFS_QUOTA_DEFS_H__ + +/* + * Quota definitions shared between user and kernel source trees. + */ + +/* + * Even though users may not have quota limits occupying all 64-bits, + * they may need 64-bit accounting. Hence, 64-bit quota-counters, + * and quota-limits. This is a waste in the common case, but hey ... + */ +typedef __uint64_t xfs_qcnt_t; +typedef __uint16_t xfs_qwarncnt_t; + +/* + * flags for q_flags field in the dquot. + */ +#define XFS_DQ_USER 0x0001 /* a user quota */ +#define XFS_DQ_PROJ 0x0002 /* project quota */ +#define XFS_DQ_GROUP 0x0004 /* a group quota */ +#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */ +#define XFS_DQ_FREEING 0x0010 /* dquot is beeing torn down */ + +#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP) + +#define XFS_DQ_FLAGS \ + { XFS_DQ_USER, "USER" }, \ + { XFS_DQ_PROJ, "PROJ" }, \ + { XFS_DQ_GROUP, "GROUP" }, \ + { XFS_DQ_DIRTY, "DIRTY" }, \ + { XFS_DQ_FREEING, "FREEING" } + +/* + * We have the possibility of all three quota types being active at once, and + * hence free space modification requires modification of all three current + * dquots in a single transaction. For this case we need to have a reservation + * of at least 3 dquots. + * + * However, a chmod operation can change both UID and GID in a single + * transaction, resulting in requiring {old, new} x {uid, gid} dquots to be + * modified. Hence for this case we need to reserve space for at least 4 dquots. + * + * And in the worst case, there's a rename operation that can be modifying up to + * 4 inodes with dquots attached to them. In reality, the only inodes that can + * have their dquots modified are the source and destination directory inodes + * due to directory name creation and removal. That can require space allocation + * and/or freeing on both directory inodes, and hence all three dquots on each + * inode can be modified. And if the directories are world writeable, all the + * dquots can be unique and so 6 dquots can be modified.... + * + * And, of course, we also need to take into account the dquot log format item + * used to describe each dquot. + */ +#define XFS_DQUOT_LOGRES(mp) \ + ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) + +#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) +#define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) +#define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) + +/* + * Incore only flags for quotaoff - these bits get cleared when quota(s) + * are in the process of getting turned off. These flags are in m_qflags but + * never in sb_qflags. + */ +#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ +#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ +#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ +#define XFS_ALL_QUOTA_ACTIVE \ + (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) + +/* + * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees + * quota will be not be switched off as long as that inode lock is held. + */ +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ + XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_OQUOTA_ON(mp) ((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \ + XFS_PQUOTA_ACTIVE)) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) + +/* + * Flags to tell various functions what to do. Not all of these are meaningful + * to a single function. None of these XFS_QMOPT_* flags are meant to have + * persistent values (ie. their values can and will change between versions) + */ +#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ +#define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ +#define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ +#define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ +#define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ +#define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ + +/* + * flags to xfs_trans_mod_dquot to indicate which field needs to be + * modified. + */ +#define XFS_QMOPT_RES_REGBLKS 0x0010000 +#define XFS_QMOPT_RES_RTBLKS 0x0020000 +#define XFS_QMOPT_BCOUNT 0x0040000 +#define XFS_QMOPT_ICOUNT 0x0080000 +#define XFS_QMOPT_RTBCOUNT 0x0100000 +#define XFS_QMOPT_DELBCOUNT 0x0200000 +#define XFS_QMOPT_DELRTBCOUNT 0x0400000 +#define XFS_QMOPT_RES_INOS 0x0800000 + +/* + * flags for dqalloc. + */ +#define XFS_QMOPT_INHERIT 0x1000000 + +/* + * flags to xfs_trans_mod_dquot. + */ +#define XFS_TRANS_DQ_RES_BLKS XFS_QMOPT_RES_REGBLKS +#define XFS_TRANS_DQ_RES_RTBLKS XFS_QMOPT_RES_RTBLKS +#define XFS_TRANS_DQ_RES_INOS XFS_QMOPT_RES_INOS +#define XFS_TRANS_DQ_BCOUNT XFS_QMOPT_BCOUNT +#define XFS_TRANS_DQ_DELBCOUNT XFS_QMOPT_DELBCOUNT +#define XFS_TRANS_DQ_ICOUNT XFS_QMOPT_ICOUNT +#define XFS_TRANS_DQ_RTBCOUNT XFS_QMOPT_RTBCOUNT +#define XFS_TRANS_DQ_DELRTBCOUNT XFS_QMOPT_DELRTBCOUNT + + +#define XFS_QMOPT_QUOTALL \ + (XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA) +#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) + +extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, + xfs_dqid_t id, uint type, uint flags, char *str); +extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); + +#endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c new file mode 100644 index 00000000000..2ad1b9822e9 --- /dev/null +++ b/fs/xfs/xfs_quotaops.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2008, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include <linux/quota.h> + + +STATIC int +xfs_quota_type(int type) +{ + switch (type) { + case USRQUOTA: + return XFS_DQ_USER; + case GRPQUOTA: + return XFS_DQ_GROUP; + default: + return XFS_DQ_PROJ; + } +} + +STATIC int +xfs_fs_get_xstate( + struct super_block *sb, + struct fs_quota_stat *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstat(mp, fqs); +} + +STATIC int +xfs_fs_get_xstatev( + struct super_block *sb, + struct fs_quota_statv *fqs) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + return -xfs_qm_scall_getqstatv(mp, fqs); +} + +STATIC int +xfs_fs_set_xstate( + struct super_block *sb, + unsigned int uflags, + int op) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + + if (uflags & FS_QUOTA_UDQ_ACCT) + flags |= XFS_UQUOTA_ACCT; + if (uflags & FS_QUOTA_PDQ_ACCT) + flags |= XFS_PQUOTA_ACCT; + if (uflags & FS_QUOTA_GDQ_ACCT) + flags |= XFS_GQUOTA_ACCT; + if (uflags & FS_QUOTA_UDQ_ENFD) + flags |= XFS_UQUOTA_ENFD; + if (uflags & FS_QUOTA_GDQ_ENFD) + flags |= XFS_GQUOTA_ENFD; + if (uflags & FS_QUOTA_PDQ_ENFD) + flags |= XFS_PQUOTA_ENFD; + + switch (op) { + case Q_XQUOTAON: + return -xfs_qm_scall_quotaon(mp, flags); + case Q_XQUOTAOFF: + if (!XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + return -xfs_qm_scall_quotaoff(mp, flags); + } + + return -EINVAL; +} + +STATIC int +xfs_fs_rm_xquota( + struct super_block *sb, + unsigned int uflags) +{ + struct xfs_mount *mp = XFS_M(sb); + unsigned int flags = 0; + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + + if (XFS_IS_QUOTA_ON(mp)) + return -EINVAL; + + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_USER; + if (uflags & FS_GROUP_QUOTA) + flags |= XFS_DQ_GROUP; + if (uflags & FS_USER_QUOTA) + flags |= XFS_DQ_PROJ; + + return -xfs_qm_scall_trunc_qfiles(mp, flags); +} + +STATIC int +xfs_fs_get_dqblk( + struct super_block *sb, + struct kqid qid, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); +} + +STATIC int +xfs_fs_set_dqblk( + struct super_block *sb, + struct kqid qid, + struct fs_disk_quota *fdq) +{ + struct xfs_mount *mp = XFS_M(sb); + + if (sb->s_flags & MS_RDONLY) + return -EROFS; + if (!XFS_IS_QUOTA_RUNNING(mp)) + return -ENOSYS; + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + + return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), + xfs_quota_type(qid.type), fdq); +} + +const struct quotactl_ops xfs_quotactl_operations = { + .get_xstatev = xfs_fs_get_xstatev, + .get_xstate = xfs_fs_get_xstate, + .set_xstate = xfs_fs_set_xstate, + .rm_xquota = xfs_fs_rm_xquota, + .get_dqblk = xfs_fs_get_dqblk, + .set_dqblk = xfs_fs_set_dqblk, +}; diff --git a/fs/xfs/xfs_refcache.h b/fs/xfs/xfs_refcache.h deleted file mode 100644 index 2dec79edb51..00000000000 --- a/fs/xfs/xfs_refcache.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_REFCACHE_H__ -#define __XFS_REFCACHE_H__ - -#ifdef HAVE_REFCACHE -/* - * Maximum size (in inodes) for the NFS reference cache - */ -#define XFS_REFCACHE_SIZE_MAX 512 - -struct xfs_inode; -struct xfs_mount; - -extern void xfs_refcache_insert(struct xfs_inode *); -extern void xfs_refcache_purge_ip(struct xfs_inode *); -extern void xfs_refcache_purge_mp(struct xfs_mount *); -extern void xfs_refcache_purge_some(struct xfs_mount *); -extern void xfs_refcache_resize(int); -extern void xfs_refcache_destroy(void); - -extern void xfs_refcache_iunlock(struct xfs_inode *, uint); - -#else - -#define xfs_refcache_insert(ip) do { } while (0) -#define xfs_refcache_purge_ip(ip) do { } while (0) -#define xfs_refcache_purge_mp(mp) do { } while (0) -#define xfs_refcache_purge_some(mp) do { } while (0) -#define xfs_refcache_resize(size) do { } while (0) -#define xfs_refcache_destroy() do { } while (0) - -#define xfs_refcache_iunlock(ip, flags) xfs_iunlock(ip, flags) - -#endif - -#endif /* __XFS_REFCACHE_H__ */ diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c deleted file mode 100644 index d98171deaa1..00000000000 --- a/fs/xfs/xfs_rename.c +++ /dev/null @@ -1,631 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_refcache.h" -#include "xfs_utils.h" -#include "xfs_trans_space.h" - - -/* - * Given an array of up to 4 inode pointers, unlock the pointed to inodes. - * If there are fewer than 4 entries in the array, the empty entries will - * be at the end and will have NULL pointers in them. - */ -STATIC void -xfs_rename_unlock4( - xfs_inode_t **i_tab, - uint lock_mode) -{ - int i; - - xfs_iunlock(i_tab[0], lock_mode); - for (i = 1; i < 4; i++) { - if (i_tab[i] == NULL) { - break; - } - /* - * Watch out for duplicate entries in the table. - */ - if (i_tab[i] != i_tab[i-1]) { - xfs_iunlock(i_tab[i], lock_mode); - } - } -} - -#ifdef DEBUG -int xfs_rename_skip, xfs_rename_nskip; -#endif - -/* - * The following routine will acquire the locks required for a rename - * operation. The code understands the semantics of renames and will - * validate that name1 exists under dp1 & that name2 may or may not - * exist under dp2. - * - * We are renaming dp1/name1 to dp2/name2. - * - * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success. - */ -STATIC int -xfs_lock_for_rename( - xfs_inode_t *dp1, /* old (source) directory inode */ - xfs_inode_t *dp2, /* new (target) directory inode */ - bhv_vname_t *vname1,/* old entry name */ - bhv_vname_t *vname2,/* new entry name */ - xfs_inode_t **ipp1, /* inode of old entry */ - xfs_inode_t **ipp2, /* inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* array of inode returned, sorted */ - int *num_inodes) /* number of inodes in array */ -{ - xfs_inode_t *ip1, *ip2, *temp; - xfs_ino_t inum1, inum2; - int error; - int i, j; - uint lock_mode; - int diff_dirs = (dp1 != dp2); - - ip2 = NULL; - - /* - * First, find out the current inums of the entries so that we - * can determine the initial locking order. We'll have to - * sanity check stuff after all the locks have been acquired - * to see if we still have the right inodes, directories, etc. - */ - lock_mode = xfs_ilock_map_shared(dp1); - error = xfs_get_dir_entry(vname1, &ip1); - if (error) { - xfs_iunlock_map_shared(dp1, lock_mode); - return error; - } - - inum1 = ip1->i_ino; - - ASSERT(ip1); - ITRACE(ip1); - - /* - * Unlock dp1 and lock dp2 if they are different. - */ - - if (diff_dirs) { - xfs_iunlock_map_shared(dp1, lock_mode); - lock_mode = xfs_ilock_map_shared(dp2); - } - - error = xfs_dir_lookup_int(XFS_ITOBHV(dp2), lock_mode, - vname2, &inum2, &ip2); - if (error == ENOENT) { /* target does not need to exist. */ - inum2 = 0; - } else if (error) { - /* - * If dp2 and dp1 are the same, the next line unlocks dp1. - * Got it? - */ - xfs_iunlock_map_shared(dp2, lock_mode); - IRELE (ip1); - return error; - } else { - ITRACE(ip2); - } - - /* - * i_tab contains a list of pointers to inodes. We initialize - * the table here & we'll sort it. We will then use it to - * order the acquisition of the inode locks. - * - * Note that the table may contain duplicates. e.g., dp1 == dp2. - */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (inum2 == 0) { - *num_inodes = 3; - i_tab[3] = NULL; - } else { - *num_inodes = 4; - i_tab[3] = ip2; - } - - /* - * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) - */ - for (i=0; i < *num_inodes; i++) { - for (j=1; j < *num_inodes; j++) { - if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; - i_tab[j] = i_tab[j-1]; - i_tab[j-1] = temp; - } - } - } - - /* - * We have dp2 locked. If it isn't first, unlock it. - * If it is first, tell xfs_lock_inodes so it can skip it - * when locking. if dp1 == dp2, xfs_lock_inodes will skip both - * since they are equal. xfs_lock_inodes needs all these inodes - * so that it can unlock and retry if there might be a dead-lock - * potential with the log. - */ - - if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) { -#ifdef DEBUG - xfs_rename_skip++; -#endif - xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED); - } else { -#ifdef DEBUG - xfs_rename_nskip++; -#endif - xfs_iunlock_map_shared(dp2, lock_mode); - xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); - } - - /* - * Set the return value. Null out any unused entries in i_tab. - */ - *ipp1 = *ipp2 = NULL; - for (i=0; i < *num_inodes; i++) { - if (i_tab[i]->i_ino == inum1) { - *ipp1 = i_tab[i]; - } - if (i_tab[i]->i_ino == inum2) { - *ipp2 = i_tab[i]; - } - } - for (;i < 4; i++) { - i_tab[i] = NULL; - } - return 0; -} - -/* - * xfs_rename - */ -int -xfs_rename( - bhv_desc_t *src_dir_bdp, - bhv_vname_t *src_vname, - bhv_vnode_t *target_dir_vp, - bhv_vname_t *target_vname, - cred_t *credp) -{ - xfs_trans_t *tp; - xfs_inode_t *src_dp, *target_dp, *src_ip, *target_ip; - xfs_mount_t *mp; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int target_ip_dropped = 0; /* dropped target_ip link? */ - bhv_vnode_t *src_dir_vp; - int spaceres; - int target_link_zero = 0; - int num_inodes; - char *src_name = VNAME(src_vname); - char *target_name = VNAME(target_vname); - int src_namelen = VNAMELEN(src_vname); - int target_namelen = VNAMELEN(target_vname); - - src_dir_vp = BHV_TO_VNODE(src_dir_bdp); - vn_trace_entry(src_dir_vp, "xfs_rename", (inst_t *)__return_address); - vn_trace_entry(target_dir_vp, "xfs_rename", (inst_t *)__return_address); - - /* - * Find the XFS behavior descriptor for the target directory - * vnode since it was not handed to us. - */ - target_dp = xfs_vtoi(target_dir_vp); - if (target_dp == NULL) { - return XFS_ERROR(EXDEV); - } - - src_dp = XFS_BHVTOI(src_dir_bdp); - mp = src_dp->i_mount; - - if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_RENAME) || - DM_EVENT_ENABLED(target_dir_vp->v_vfsp, - target_dp, DM_EVENT_RENAME)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, - src_dir_vp, DM_RIGHT_NULL, - target_dir_vp, DM_RIGHT_NULL, - src_name, target_name, - 0, 0, 0); - if (error) { - return error; - } - } - /* Return through std_return after this point. */ - - /* - * Lock all the participating inodes. Depending upon whether - * the target_name exists in the target directory, and - * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. - * xfs_lock_for_rename() will return ENOENT if src_name - * does not exist in the source directory. - */ - tp = NULL; - error = xfs_lock_for_rename(src_dp, target_dp, src_vname, - target_vname, &src_ip, &target_ip, inodes, - &num_inodes); - - if (error) { - /* - * We have nothing locked, no inode references, and - * no transaction, so just get out. - */ - goto std_return; - } - - ASSERT(src_ip != NULL); - - if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { - /* - * Check for link count overflow on target_dp - */ - if (target_ip == NULL && (src_dp != target_dp) && - target_dp->i_d.di_nlink >= XFS_MAXLINK) { - error = XFS_ERROR(EMLINK); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; - } - } - - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { - error = XFS_ERROR(EXDEV); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; - } - - new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - - /* - * Drop the locks on our inodes so that we can start the transaction. - */ - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - - XFS_BMAP_INIT(&free_list, &first_block); - tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen); - error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); - if (error == ENOSPC) { - spaceres = 0; - error = xfs_trans_reserve(tp, 0, XFS_RENAME_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); - } - if (error) { - xfs_trans_cancel(tp, 0); - goto rele_return; - } - - /* - * Attach the dquots to the inodes - */ - if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) { - xfs_trans_cancel(tp, cancel_flags); - goto rele_return; - } - - /* - * Reacquire the inode locks we dropped above. - */ - xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL); - - /* - * Join all the inodes to the transaction. From this point on, - * we can rely on either trans_commit or trans_cancel to unlock - * them. Note that we need to add a vnode reference to the - * directories since trans_commit & trans_cancel will decrement - * them when they unlock the inodes. Also, we need to be careful - * not to add an inode to the transaction more than once. - */ - VN_HOLD(src_dir_vp); - xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); - if (new_parent) { - VN_HOLD(target_dir_vp); - xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); - } - if ((src_ip != src_dp) && (src_ip != target_dp)) { - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - } - if ((target_ip != NULL) && - (target_ip != src_ip) && - (target_ip != src_dp) && - (target_ip != target_dp)) { - xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); - } - - /* - * Set up the target. - */ - if (target_ip == NULL) { - /* - * If there's no space reservation, check the entry will - * fit before actually inserting it. - */ - if (spaceres == 0 && - (error = xfs_dir_canenter(tp, target_dp, target_name, - target_namelen))) - goto error_return; - /* - * If target does not exist and the rename crosses - * directories, adjust the target directory link count - * to account for the ".." reference from the new entry. - */ - error = xfs_dir_createname(tp, target_dp, target_name, - target_namelen, src_ip->i_ino, - &first_block, &free_list, spaceres); - if (error == ENOSPC) - goto error_return; - if (error) - goto abort_return; - xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - if (new_parent && src_is_directory) { - error = xfs_bumplink(tp, target_dp); - if (error) - goto abort_return; - } - } else { /* target_ip != NULL */ - /* - * If target exists and it's a directory, check that both - * target and source are directories and that target can be - * destroyed, or that neither is a directory. - */ - if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { - /* - * Make sure target dir is empty. - */ - if (!(xfs_dir_isempty(target_ip)) || - (target_ip->i_d.di_nlink > 2)) { - error = XFS_ERROR(EEXIST); - goto error_return; - } - } - - /* - * Link the source inode under the target name. - * If the source inode is a directory and we are moving - * it across directories, its ".." entry will be - * inconsistent until we replace that down below. - * - * In case there is already an entry with the same - * name at the destination directory, remove it first. - */ - error = xfs_dir_replace(tp, target_dp, target_name, - target_namelen, src_ip->i_ino, - &first_block, &free_list, spaceres); - if (error) - goto abort_return; - xfs_ichgtime(target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Decrement the link count on the target since the target - * dir no longer points to it. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto abort_return; - target_ip_dropped = 1; - - if (src_is_directory) { - /* - * Drop the link from the old "." entry. - */ - error = xfs_droplink(tp, target_ip); - if (error) - goto abort_return; - } - - /* Do this test while we still hold the locks */ - target_link_zero = (target_ip)->i_d.di_nlink==0; - - } /* target_ip != NULL */ - - /* - * Remove the source. - */ - if (new_parent && src_is_directory) { - /* - * Rewrite the ".." entry to point to the new - * directory. - */ - error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino, - &first_block, &free_list, spaceres); - ASSERT(error != EEXIST); - if (error) - goto abort_return; - xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - } else { - /* - * We always want to hit the ctime on the source inode. - * We do it in the if clause above for the 'new_parent && - * src_is_directory' case, and here we get all the other - * cases. This isn't strictly required by the standards - * since the source inode isn't really being changed, - * but old unix file systems did it and some incremental - * backup programs won't work without it. - */ - xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG); - } - - /* - * Adjust the link count on src_dp. This is necessary when - * renaming a directory, either within one parent when - * the target existed, or across two parent directories. - */ - if (src_is_directory && (new_parent || target_ip != NULL)) { - - /* - * Decrement link count on src_directory since the - * entry that's moved no longer points to it. - */ - error = xfs_droplink(tp, src_dp); - if (error) - goto abort_return; - } - - error = xfs_dir_removename(tp, src_dp, src_name, src_namelen, - src_ip->i_ino, &first_block, &free_list, spaceres); - if (error) - goto abort_return; - xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Update the generation counts on all the directory inodes - * that we're modifying. - */ - src_dp->i_gen++; - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - - if (new_parent) { - target_dp->i_gen++; - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - } - - /* - * If there was a target inode, take an extra reference on - * it here so that it doesn't go to xfs_inactive() from - * within the commit. - */ - if (target_ip != NULL) { - IHOLD(target_ip); - } - - /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - /* - * Take refs. for vop_link_removed calls below. No need to worry - * about directory refs. because the caller holds them. - * - * Do holds before the xfs_bmap_finish since it might rele them down - * to zero. - */ - - if (target_ip_dropped) - IHOLD(target_ip); - IHOLD(src_ip); - - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - if (target_ip != NULL) { - IRELE(target_ip); - } - if (target_ip_dropped) { - IRELE(target_ip); - } - IRELE(src_ip); - goto std_return; - } - - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (target_ip != NULL) { - xfs_refcache_purge_ip(target_ip); - IRELE(target_ip); - } - /* - * Let interposed file systems know about removed links. - */ - if (target_ip_dropped) { - bhv_vop_link_removed(XFS_ITOV(target_ip), target_dir_vp, - target_link_zero); - IRELE(target_ip); - } - - IRELE(src_ip); - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit */ -std_return: - if (DM_EVENT_ENABLED(src_dir_vp->v_vfsp, src_dp, DM_EVENT_POSTRENAME) || - DM_EVENT_ENABLED(target_dir_vp->v_vfsp, - target_dp, DM_EVENT_POSTRENAME)) { - (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, - src_dir_vp, DM_RIGHT_NULL, - target_dir_vp, DM_RIGHT_NULL, - src_name, target_name, - 0, error, 0); - } - return error; - - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - error_return: - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - - rele_return: - IRELE(src_ip); - if (target_ip != NULL) { - IRELE(target_ip); - } - goto std_return; -} diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5a0b678956e..ec5ca65c621 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -17,186 +17,260 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" #include "xfs_bmap.h" -#include "xfs_rtalloc.h" -#include "xfs_fsops.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" #include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_inode_item.h" +#include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_rtalloc.h" /* - * Prototypes for internal functions. + * Read and return the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. */ +STATIC int /* error */ +xfs_rtget_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ +{ + xfs_buf_t *bp; /* buffer for summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information & copy it out. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sum = *sp; + /* + * Drop the buffer if we're not asked to remember it. + */ + if (!rbpp) + xfs_trans_brelse(tp, bp); + return 0; +} -STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *); -STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int, - xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *); -STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, int, xfs_rtblock_t *, int *); -STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_rtblock_t, xfs_rtblock_t *); -STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_rtblock_t, xfs_rtblock_t *); -STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int, - xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *); -STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t, - xfs_extlen_t, int); -STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int, - xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *); /* - * Internal functions. + * Return whether there are any free extents in the size range given + * by low and high, for the bitmap block bbno. */ +STATIC int /* error */ +xfs_rtany_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int low, /* low log2 extent size */ + int high, /* high log2 extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + int *stat) /* out: any good extents here? */ +{ + int error; /* error value */ + int log; /* loop counter, log2 of ext. size */ + xfs_suminfo_t sum; /* summary data */ + + /* + * Loop over logs of extent sizes. Order is irrelevant. + */ + for (log = low; log <= high; log++) { + /* + * Get one summary datum. + */ + error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); + if (error) { + return error; + } + /* + * If there are any, return success. + */ + if (sum) { + *stat = 1; + return 0; + } + } + /* + * Found nothing, return failure. + */ + *stat = 0; + return 0; +} + /* - * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set. + * Copy and transform the summary file, given the old and new + * parameters in the mount structures. */ -STATIC int -xfs_lowbit32( - __uint32_t v) +STATIC int /* error */ +xfs_rtcopy_summary( + xfs_mount_t *omp, /* old file system mount point */ + xfs_mount_t *nmp, /* new file system mount point */ + xfs_trans_t *tp) /* transaction pointer */ { - if (v) - return ffs(v) - 1; - return -1; -} + xfs_rtblock_t bbno; /* bitmap block number */ + xfs_buf_t *bp; /* summary buffer */ + int error; /* error return value */ + int log; /* summary level number (log length) */ + xfs_suminfo_t sum; /* summary data */ + xfs_fsblock_t sumbno; /* summary block number */ + bp = NULL; + for (log = omp->m_rsumlevels - 1; log >= 0; log--) { + for (bbno = omp->m_sb.sb_rbmblocks - 1; + (xfs_srtblock_t)bbno >= 0; + bbno--) { + error = xfs_rtget_summary(omp, tp, log, bbno, &bp, + &sumbno, &sum); + if (error) + return error; + if (sum == 0) + continue; + error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, + &bp, &sumbno); + if (error) + return error; + error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, + &bp, &sumbno); + if (error) + return error; + ASSERT(sum > 0); + } + } + return 0; +} /* - * Allocate space to the bitmap or summary file, and zero it, for growfs. + * Mark an extent specified by start and len allocated. + * Updates all the summary information as well as the bitmap. */ STATIC int /* error */ -xfs_growfs_rt_alloc( +xfs_rtallocate_range( xfs_mount_t *mp, /* file system mount point */ - xfs_extlen_t oblocks, /* old count of blocks */ - xfs_extlen_t nblocks, /* new count of blocks */ - xfs_ino_t ino) /* inode number (bitmap/summary) */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* start block to allocate */ + xfs_extlen_t len, /* length to allocate */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ { - xfs_fileoff_t bno; /* block number in file */ - xfs_buf_t *bp; /* temporary buffer for zeroing */ - int cancelflags; /* flags for xfs_trans_cancel */ - int committed; /* transaction committed flag */ - xfs_daddr_t d; /* disk block address */ - int error; /* error return value */ - xfs_fsblock_t firstblock; /* first block allocated in xaction */ - xfs_bmap_free_t flist; /* list of freed blocks */ - xfs_fsblock_t fsbno; /* filesystem block for bno */ - xfs_inode_t *ip; /* pointer to incore inode */ - xfs_bmbt_irec_t map; /* block map output */ - int nmap; /* number of block maps */ - int resblks; /* space reservation */ - xfs_trans_t *tp; /* transaction pointer */ + xfs_rtblock_t end; /* end of the allocated extent */ + int error; /* error value */ + xfs_rtblock_t postblock = 0; /* first block allocated > end */ + xfs_rtblock_t preblock = 0; /* first block allocated < start */ + end = start + len - 1; /* - * Allocate space to the file, as necessary. + * Assume we're allocating out of the middle of a free extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. */ - while (oblocks < nblocks) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); - resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); - cancelflags = 0; - /* - * Reserve space & log for one extent added to the file. - */ - if ((error = xfs_trans_reserve(tp, resblks, - XFS_GROWRTALLOC_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_DEFAULT_PERM_LOG_COUNT))) - goto error_exit; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; - /* - * Lock the inode. - */ - if ((error = xfs_trans_iget(mp, tp, ino, 0, - XFS_ILOCK_EXCL, &ip))) - goto error_exit; - XFS_BMAP_INIT(&flist, &firstblock); - /* - * Allocate blocks to the bitmap file. - */ - nmap = 1; - cancelflags |= XFS_TRANS_ABORT; - error = xfs_bmapi(tp, ip, oblocks, nblocks - oblocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, &firstblock, - resblks, &map, &nmap, &flist, NULL); - if (!error && nmap < 1) - error = XFS_ERROR(ENOSPC); - if (error) - goto error_exit; - /* - * Free any blocks freed up in the transaction, then commit. - */ - error = xfs_bmap_finish(&tp, &flist, firstblock, &committed); - if (error) - goto error_exit; - xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - /* - * Now we need to clear the allocated blocks. - * Do this one block per transaction, to keep it simple. - */ - cancelflags = 0; - for (bno = map.br_startoff, fsbno = map.br_startblock; - bno < map.br_startoff + map.br_blockcount; - bno++, fsbno++) { - tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); - /* - * Reserve log for one block zeroing. - */ - if ((error = xfs_trans_reserve(tp, 0, - XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) - goto error_exit; - /* - * Lock the bitmap inode. - */ - if ((error = xfs_trans_iget(mp, tp, ino, 0, - XFS_ILOCK_EXCL, &ip))) - goto error_exit; - /* - * Get a buffer for the block. - */ - d = XFS_FSB_TO_DADDR(mp, fsbno); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - mp->m_bsize, 0); - if (bp == NULL) { - error = XFS_ERROR(EIO); - goto error_exit; - } - memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); - xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); - /* - * Commit the transaction. - */ - xfs_trans_commit(tp, 0, NULL); + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of free extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) { + return error; + } + /* + * Decrement the summary information corresponding to the entire + * (old) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + /* + * If there are blocks not being allocated at the front of the + * old extent, add summary data for them to be free. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + if (error) { + return error; } - /* - * Go on to the next extent, if any. - */ - oblocks = map.br_startoff + map.br_blockcount; } - return 0; -error_exit: - xfs_trans_cancel(tp, cancelflags); + /* + * If there are blocks not being allocated at the end of the + * old extent, add summary data for them to be free. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Modify the bitmap to mark this extent allocated. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 0); return error; } @@ -444,6 +518,7 @@ xfs_rtallocate_extent_near( } bbno = XFS_BITTOBLOCK(mp, bno); i = 0; + ASSERT(minlen != 0); log2len = xfs_highbit32(minlen); /* * Loop over all bitmap blocks (bbno + i is current block). @@ -612,6 +687,8 @@ xfs_rtallocate_extent_size( xfs_suminfo_t sum; /* summary information for extents */ ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(maxlen != 0); + /* * Loop over all the levels starting with maxlen. * At each level, look at all the bitmap blocks, to see if there @@ -669,6 +746,9 @@ xfs_rtallocate_extent_size( *rtblock = NULLRTBLOCK; return 0; } + ASSERT(minlen != 0); + ASSERT(maxlen != 0); + /* * Loop over sizes, from maxlen down to minlen. * This time, when we do the allocations, allow smaller ones @@ -729,1171 +809,126 @@ xfs_rtallocate_extent_size( } /* - * Mark an extent specified by start and len allocated. - * Updates all the summary information as well as the bitmap. + * Allocate space to the bitmap or summary file, and zero it, for growfs. */ STATIC int /* error */ -xfs_rtallocate_range( +xfs_growfs_rt_alloc( xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* start block to allocate */ - xfs_extlen_t len, /* length to allocate */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + xfs_extlen_t oblocks, /* old count of blocks */ + xfs_extlen_t nblocks, /* new count of blocks */ + xfs_inode_t *ip) /* inode (bitmap/summary) */ { - xfs_rtblock_t end; /* end of the allocated extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block allocated > end */ - xfs_rtblock_t preblock; /* first block allocated < start */ + xfs_fileoff_t bno; /* block number in file */ + xfs_buf_t *bp; /* temporary buffer for zeroing */ + int committed; /* transaction committed flag */ + xfs_daddr_t d; /* disk block address */ + int error; /* error return value */ + xfs_fsblock_t firstblock; /* first block allocated in xaction */ + xfs_bmap_free_t flist; /* list of freed blocks */ + xfs_fsblock_t fsbno; /* filesystem block for bno */ + xfs_bmbt_irec_t map; /* block map output */ + int nmap; /* number of block maps */ + int resblks; /* space reservation */ - end = start + len - 1; - /* - * Assume we're allocating out of the middle of a free extent. - * We need to find the beginning and end of the extent so we can - * properly update the summary. - */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { - return error; - } - /* - * Find the next allocated block (end of free extent). - */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - if (error) { - return error; - } /* - * Decrement the summary information corresponding to the entire - * (old) free extent. - */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { - return error; - } - /* - * If there are blocks not being allocated at the front of the - * old extent, add summary data for them to be free. - */ - if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * If there are blocks not being allocated at the end of the - * old extent, add summary data for them to be free. - */ - if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * Modify the bitmap to mark this extent allocated. + * Allocate space to the file, as necessary. */ - error = xfs_rtmodify_range(mp, tp, start, len, 0); - return error; -} - -/* - * Return whether there are any free extents in the size range given - * by low and high, for the bitmap block bbno. - */ -STATIC int /* error */ -xfs_rtany_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int low, /* low log2 extent size */ - int high, /* high log2 extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - int *stat) /* out: any good extents here? */ -{ - int error; /* error value */ - int log; /* loop counter, log2 of ext. size */ - xfs_suminfo_t sum; /* summary data */ + while (oblocks < nblocks) { + int cancelflags = 0; + xfs_trans_t *tp; - /* - * Loop over logs of extent sizes. Order is irrelevant. - */ - for (log = low; log <= high; log++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); + resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks); /* - * Get one summary datum. + * Reserve space & log for one extent added to the file. */ - error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum); - if (error) { - return error; - } + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc, + resblks, 0); + if (error) + goto error_cancel; + cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* - * If there are any, return success. + * Lock the inode. */ - if (sum) { - *stat = 1; - return 0; - } - } - /* - * Found nothing, return failure. - */ - *stat = 0; - return 0; -} - -/* - * Get a buffer for the bitmap or summary file block specified. - * The buffer is returned read and locked. - */ -STATIC int /* error */ -xfs_rtbuf_get( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t block, /* block number in bitmap or summary */ - int issum, /* is summary not bitmap */ - xfs_buf_t **bpp) /* output: buffer for the block */ -{ - xfs_buf_t *bp; /* block buffer, result */ - xfs_daddr_t d; /* disk addr of block */ - int error; /* error value */ - xfs_fsblock_t fsb; /* fs block number for block */ - xfs_inode_t *ip; /* bitmap or summary inode */ - - ip = issum ? mp->m_rsumip : mp->m_rbmip; - /* - * Map from the file offset (block) and inode number to the - * file system block. - */ - error = xfs_bmapi_single(tp, ip, XFS_DATA_FORK, &fsb, block); - if (error) { - return error; - } - ASSERT(fsb != NULLFSBLOCK); - /* - * Convert to disk address for buffer cache. - */ - d = XFS_FSB_TO_DADDR(mp, fsb); - /* - * Read the buffer. - */ - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, - mp->m_bsize, 0, &bp); - if (error) { - return error; - } - ASSERT(bp && !XFS_BUF_GETERROR(bp)); - *bpp = bp; - return 0; -} - -#ifdef DEBUG -/* - * Check that the given extent (block range) is allocated already. - */ -STATIC int /* error */ -xfs_rtcheck_alloc_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* out: 1 for allocated, 0 for not */ -{ - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - - return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat); -} -#endif - -#ifdef DEBUG -/* - * Check whether the given block in the bitmap has the given value. - */ -STATIC int /* 1 for matches, 0 for not */ -xfs_rtcheck_bit( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* bit (block) to check */ - int val) /* 1 for free, 0 for allocated */ -{ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* pointer into the buffer */ - /* REFERENCED */ - int error; /* error value */ - xfs_rtword_t wdiff; /* difference between bit & expected */ - int word; /* word number in the buffer */ - xfs_rtword_t wval; /* word value from buffer */ - - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = XFS_BITTOWORD(mp, start); - bit = (int)(start & (XFS_NBWORD - 1)); - wval = bufp[word]; - xfs_trans_brelse(tp, bp); - wdiff = (wval ^ -val) & ((xfs_rtword_t)1 << bit); - return !wdiff; -} -#endif /* DEBUG */ - -#if 0 -/* - * Check that the given extent (block range) is free already. - */ -STATIC int /* error */ -xfs_rtcheck_free_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int *stat) /* out: 1 for free, 0 for not */ -{ - xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ - - return xfs_rtcheck_range(mp, tp, bno, len, 1, &new, stat); -} -#endif - -/* - * Check that the given range is either all allocated (val = 0) or - * all free (val = 1). - */ -STATIC int /* error */ -xfs_rtcheck_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block number of extent */ - xfs_extlen_t len, /* length of extent */ - int val, /* 1 for free, 0 for allocated */ - xfs_rtblock_t *new, /* out: first block not matching */ - int *stat) /* out: 1 for matches, 0 for not */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - /* - * Compute starting bitmap block number - */ - block = XFS_BITTOBLOCK(mp, start); - /* - * Read the bitmap block. - */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - /* - * Compute the starting word's address, and starting bit. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - /* - * 0 (allocated) => all zero's; 1 (free) => all one's. - */ - val = -val; - /* - * If not starting on a word boundary, deal with the first - * (partial) word. - */ - if (bit) { - /* - * Compute first bit not examined. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - /* - * Mask of relevant bits. - */ - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = (*b ^ val) & mask)) { - /* - * Different, compute first wrong bit and return. - */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *new = start + i; - *stat = 0; - return 0; - } - i = lastbit - bit; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the next one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { + xfs_bmap_init(&flist, &firstblock); /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ val)) { - /* - * Different, compute first wrong bit and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *new = start + i; - *stat = 0; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to next block if that's where the next word is - * and we need the next word. + * Allocate blocks to the bitmap file. */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the next one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = 0; - } else { - /* - * Go on to the next word in the buffer. - */ - b++; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { + nmap = 1; + cancelflags |= XFS_TRANS_ABORT; + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, &firstblock, + resblks, &map, &nmap, &flist); + if (!error && nmap < 1) + error = XFS_ERROR(ENOSPC); + if (error) + goto error_cancel; /* - * Mask of relevant bits. + * Free any blocks freed up in the transaction, then commit. */ - mask = ((xfs_rtword_t)1 << lastbit) - 1; + error = xfs_bmap_finish(&tp, &flist, &committed); + if (error) + goto error_cancel; + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto error; /* - * Compute difference between actual and desired value. + * Now we need to clear the allocated blocks. + * Do this one block per transaction, to keep it simple. */ - if ((wdiff = (*b ^ val) & mask)) { + cancelflags = 0; + for (bno = map.br_startoff, fsbno = map.br_startblock; + bno < map.br_startoff + map.br_blockcount; + bno++, fsbno++) { + tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO); /* - * Different, compute first wrong bit and return. + * Reserve log for one block zeroing. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *new = start + i; - *stat = 0; - return 0; - } else - i = len; - } - /* - * Successful, return. - */ - xfs_trans_brelse(tp, bp); - *new = start + i; - *stat = 1; - return 0; -} - -/* - * Copy and transform the summary file, given the old and new - * parameters in the mount structures. - */ -STATIC int /* error */ -xfs_rtcopy_summary( - xfs_mount_t *omp, /* old file system mount point */ - xfs_mount_t *nmp, /* new file system mount point */ - xfs_trans_t *tp) /* transaction pointer */ -{ - xfs_rtblock_t bbno; /* bitmap block number */ - xfs_buf_t *bp; /* summary buffer */ - int error; /* error return value */ - int log; /* summary level number (log length) */ - xfs_suminfo_t sum; /* summary data */ - xfs_fsblock_t sumbno; /* summary block number */ - - bp = NULL; - for (log = omp->m_rsumlevels - 1; log >= 0; log--) { - for (bbno = omp->m_sb.sb_rbmblocks - 1; - (xfs_srtblock_t)bbno >= 0; - bbno--) { - error = xfs_rtget_summary(omp, tp, log, bbno, &bp, - &sumbno, &sum); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero, + 0, 0); if (error) - return error; - if (sum == 0) - continue; - error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum, - &bp, &sumbno); - if (error) - return error; - error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum, - &bp, &sumbno); - if (error) - return error; - ASSERT(sum > 0); - } - } - return 0; -} - -/* - * Searching backward from start to limit, find the first block whose - * allocated/free state is different from start's. - */ -STATIC int /* error */ -xfs_rtfind_back( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t firstbit; /* first useful bit in the word */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ - - /* - * Compute and read in starting bitmap block for starting block. - */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - /* - * Get the first word's index & point to it. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - len = start - limit + 1; - /* - * Compute match value, based on the bit at start: if 1 (free) - * then all-ones, else all-zeroes. - */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; - /* - * If the starting position is not word-aligned, deal with the - * partial word. - */ - if (bit < XFS_NBWORD - 1) { - /* - * Calculate first (leftmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); - mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << - firstbit; - /* - * Calculate the difference between the value there - * and what we're looking for. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different. Mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i = bit - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } - i = bit - firstbit + 1; - /* - * Go on to previous block if that's where the previous word is - * and we need the previous word. - */ - if (--word == -1 && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { - return error; - } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the previous one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ want)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to previous block if that's where the previous word is - * and we need the previous word. - */ - if (--word == -1 && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); - if (error) { - return error; - } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = XFS_BLOCKWMASK(mp); - b = &bufp[word]; - } else { - /* - * Go on to the previous word in the buffer. - */ - b--; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if (len - i) { - /* - * Calculate first (leftmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - firstbit = XFS_NBWORD - (len - i); - mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); - *rtblock = start - i + 1; - return 0; - } else - i = len; - } - /* - * No match, return that we scanned the whole area. - */ - xfs_trans_brelse(tp, bp); - *rtblock = start - i + 1; - return 0; -} - -/* - * Searching forward from start to limit, find the first block whose - * allocated/free state is different from start's. - */ -STATIC int /* error */ -xfs_rtfind_forw( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to look at */ - xfs_rtblock_t limit, /* last block to look at */ - xfs_rtblock_t *rtblock) /* out: start block found */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtblock_t i; /* current bit number rel. to start */ - xfs_rtblock_t lastbit; /* last useful bit in the word */ - xfs_rtblock_t len; /* length of inspected area */ - xfs_rtword_t mask; /* mask of relevant bits for value */ - xfs_rtword_t want; /* mask for "good" values */ - xfs_rtword_t wdiff; /* difference from wanted value */ - int word; /* word number in the buffer */ - - /* - * Compute and read in starting bitmap block for starting block. - */ - block = XFS_BITTOBLOCK(mp, start); - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - /* - * Get the first word's index & point to it. - */ - word = XFS_BITTOWORD(mp, start); - b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - len = limit - start + 1; - /* - * Compute match value, based on the bit at start: if 1 (free) - * then all-ones, else all-zeroes. - */ - want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; - /* - * If the starting position is not word-aligned, deal with the - * partial word. - */ - if (bit) { - /* - * Calculate last (rightmost) bit number to look at, - * and mask for all the relevant bits in this word. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Calculate the difference between the value there - * and what we're looking for. - */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different. Mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i = XFS_RTLOBIT(wdiff) - bit; - *rtblock = start + i - 1; - return 0; - } - i = lastbit - bit; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * If done with this block, get the previous one. - */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = 0; - } else { - /* - * Go on to the previous word in the buffer. - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Compute difference between actual and desired value. - */ - if ((wdiff = *b ^ want)) { + goto error_cancel; /* - * Different, mark where we are and return. + * Lock the bitmap inode. */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; - return 0; - } - i += XFS_NBWORD; - /* - * Go on to next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* - * If done with this block, get the next one. + * Get a buffer for the block. */ - xfs_trans_brelse(tp, bp); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; + d = XFS_FSB_TO_DADDR(mp, fsbno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + mp->m_bsize, 0); + if (bp == NULL) { + error = XFS_ERROR(EIO); +error_cancel: + xfs_trans_cancel(tp, cancelflags); + goto error; } - b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = 0; - } else { + memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); + xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* - * Go on to the next word in the buffer. + * Commit the transaction. */ - b++; + error = xfs_trans_commit(tp, 0); + if (error) + goto error; } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { /* - * Calculate mask for all the relevant bits in this word. - */ - mask = ((xfs_rtword_t)1 << lastbit) - 1; - /* - * Compute difference between actual and desired value. + * Go on to the next extent, if any. */ - if ((wdiff = (*b ^ want) & mask)) { - /* - * Different, mark where we are and return. - */ - xfs_trans_brelse(tp, bp); - i += XFS_RTLOBIT(wdiff); - *rtblock = start + i - 1; - return 0; - } else - i = len; + oblocks = map.br_startoff + map.br_blockcount; } - /* - * No match, return that we scanned the whole area. - */ - xfs_trans_brelse(tp, bp); - *rtblock = start + i - 1; return 0; -} -/* - * Mark an extent specified by start and len freed. - * Updates all the summary information as well as the bitmap. - */ -STATIC int /* error */ -xfs_rtfree_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to free */ - xfs_extlen_t len, /* length to free */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ -{ - xfs_rtblock_t end; /* end of the freed extent */ - int error; /* error value */ - xfs_rtblock_t postblock; /* first block freed > end */ - xfs_rtblock_t preblock; /* first block freed < start */ - - end = start + len - 1; - /* - * Modify the bitmap to mark this extent freed. - */ - error = xfs_rtmodify_range(mp, tp, start, len, 1); - if (error) { - return error; - } - /* - * Assume we're freeing out of the middle of an allocated extent. - * We need to find the beginning and end of the extent so we can - * properly update the summary. - */ - error = xfs_rtfind_back(mp, tp, start, 0, &preblock); - if (error) { - return error; - } - /* - * Find the next allocated block (end of allocated extent). - */ - error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, - &postblock); - /* - * If there are blocks not being freed at the front of the - * old extent, add summary data for them to be allocated. - */ - if (preblock < start) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(start - preblock), - XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * If there are blocks not being freed at the end of the - * old extent, add summary data for them to be allocated. - */ - if (postblock > end) { - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock - end), - XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); - if (error) { - return error; - } - } - /* - * Increment the summary information corresponding to the entire - * (new) free extent. - */ - error = xfs_rtmodify_summary(mp, tp, - XFS_RTBLOCKLOG(postblock + 1 - preblock), - XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); +error: return error; } /* - * Read and return the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -STATIC int /* error */ -xfs_rtget_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb, /* in/out: summary block number */ - xfs_suminfo_t *sum) /* out: summary info for this block */ -{ - xfs_buf_t *bp; /* buffer for summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information & copy it out. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sum = *sp; - /* - * Drop the buffer if we're not asked to remember it. - */ - if (!rbpp) - xfs_trans_brelse(tp, bp); - return 0; -} - -/* - * Set the given range of bitmap bits to the given value. - * Do whatever I/O and logging is required. - */ -STATIC int /* error */ -xfs_rtmodify_range( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to modify */ - xfs_extlen_t len, /* length of extent to modify */ - int val) /* 1 for free, 0 for allocated */ -{ - xfs_rtword_t *b; /* current word in buffer */ - int bit; /* bit number in the word */ - xfs_rtblock_t block; /* bitmap block number */ - xfs_buf_t *bp; /* buf for the block */ - xfs_rtword_t *bufp; /* starting word in buffer */ - int error; /* error value */ - xfs_rtword_t *first; /* first used word in the buffer */ - int i; /* current bit number rel. to start */ - int lastbit; /* last useful bit in word */ - xfs_rtword_t mask; /* mask o frelevant bits for value */ - int word; /* word number in the buffer */ - - /* - * Compute starting bitmap block number. - */ - block = XFS_BITTOBLOCK(mp, start); - /* - * Read the bitmap block, and point to its data. - */ - error = xfs_rtbuf_get(mp, tp, block, 0, &bp); - if (error) { - return error; - } - bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - /* - * Compute the starting word's address, and starting bit. - */ - word = XFS_BITTOWORD(mp, start); - first = b = &bufp[word]; - bit = (int)(start & (XFS_NBWORD - 1)); - /* - * 0 (allocated) => all zeroes; 1 (free) => all ones. - */ - val = -val; - /* - * If not starting on a word boundary, deal with the first - * (partial) word. - */ - if (bit) { - /* - * Compute first bit not changed and mask of relevant bits. - */ - lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); - mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; - /* - * Set/clear the active bits. - */ - if (val) - *b |= mask; - else - *b &= ~mask; - i = lastbit - bit; - /* - * Go on to the next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * Log the changed part of this block. - * Get the next one. - */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; - } - } else { - /* - * Starting on a word boundary, no partial word. - */ - i = 0; - } - /* - * Loop over whole words in buffers. When we use up one buffer - * we move on to the next one. - */ - while (len - i >= XFS_NBWORD) { - /* - * Set the word value correctly. - */ - *b = val; - i += XFS_NBWORD; - /* - * Go on to the next block if that's where the next word is - * and we need the next word. - */ - if (++word == XFS_BLOCKWSIZE(mp) && i < len) { - /* - * Log the changed part of this block. - * Get the next one. - */ - xfs_trans_log_buf(tp, bp, - (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp)); - error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); - if (error) { - return error; - } - first = b = bufp = (xfs_rtword_t *)XFS_BUF_PTR(bp); - word = 0; - } else { - /* - * Go on to the next word in the buffer - */ - b++; - } - } - /* - * If not ending on a word boundary, deal with the last - * (partial) word. - */ - if ((lastbit = len - i)) { - /* - * Compute a mask of relevant bits. - */ - bit = 0; - mask = ((xfs_rtword_t)1 << lastbit) - 1; - /* - * Set/clear the active bits. - */ - if (val) - *b |= mask; - else - *b &= ~mask; - b++; - } - /* - * Log any remaining changed bytes. - */ - if (b > first) - xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), - (uint)((char *)b - (char *)bufp - 1)); - return 0; -} - -/* - * Read and modify the summary information for a given extent size, - * bitmap block combination. - * Keeps track of a current summary block, so we don't keep reading - * it from the buffer cache. - */ -STATIC int /* error */ -xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - int log, /* log2 of extent size */ - xfs_rtblock_t bbno, /* bitmap block number */ - int delta, /* change to make to summary info */ - xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ -{ - xfs_buf_t *bp; /* buffer for the summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information, modify and log it. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)XFS_BUF_PTR(bp)), - (uint)((char *)sp - (char *)XFS_BUF_PTR(bp) + sizeof(*sp) - 1)); - return 0; -} - -/* * Visible (exported) functions. */ @@ -1907,9 +942,7 @@ xfs_growfs_rt( { xfs_rtblock_t bmbno; /* bitmap block number */ xfs_buf_t *bp; /* temporary buffer */ - int cancelflags; /* flags for xfs_trans_cancel */ int error; /* error return value */ - xfs_inode_t *ip; /* bitmap inode, used as lock */ xfs_mount_t *nmp; /* new (fake) mount structure */ xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ @@ -1923,32 +956,40 @@ xfs_growfs_rt( xfs_extlen_t rsumblocks; /* current number of rt summary blks */ xfs_sb_t *sbp; /* old superblock */ xfs_fsblock_t sumbno; /* summary block number */ - xfs_trans_t *tp; /* transaction pointer */ sbp = &mp->m_sb; /* * Initial error checking. */ + if (!capable(CAP_SYS_ADMIN)) + return XFS_ERROR(EPERM); if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || (nrblocks = in->newblocks) <= sbp->sb_rblocks || (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) return XFS_ERROR(EINVAL); + if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks))) + return error; /* * Read in the last block of the device, make sure it exists. */ - error = xfs_read_buf(mp, mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp); - if (error) + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, nrblocks - 1), + XFS_FSB_TO_BB(mp, 1), 0, NULL); + if (!bp) + return EIO; + if (bp->b_error) { + error = bp->b_error; + xfs_buf_relse(bp); return error; - ASSERT(bp); + } xfs_buf_relse(bp); + /* * Calculate new parameters. These are the final values to be reached. */ nrextents = nrblocks; do_div(nrextents, in->extsize); - nrbmblocks = roundup_64(nrextents, NBBY * sbp->sb_blocksize); + nrbmblocks = howmany_64(nrextents, NBBY * sbp->sb_blocksize); nrextslog = xfs_highbit32(nrextents); nrsumlevels = nrextslog + 1; nrsumsize = (uint)sizeof(xfs_suminfo_t) * nrsumlevels * nrbmblocks; @@ -1970,13 +1011,16 @@ xfs_growfs_rt( /* * Allocate space to the bitmap and summary files, as necessary. */ - if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, - mp->m_sb.sb_rbmino))) + error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip); + if (error) return error; - if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, - mp->m_sb.sb_rsumino))) + error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip); + if (error) return error; - nmp = NULL; + /* + * Allocate a new (fake) mount/sb. + */ + nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); /* * Loop over the bitmap blocks. * We will do everything one bitmap block at a time. @@ -1987,10 +1031,9 @@ xfs_growfs_rt( ((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0); bmbno < nrbmblocks; bmbno++) { - /* - * Allocate a new (fake) mount/sb. - */ - nmp = kmem_alloc(sizeof(*nmp), KM_SLEEP); + xfs_trans_t *tp; + int cancelflags = 0; + *nmp = *mp; nsbp = &nmp->m_sb; /* @@ -2004,6 +1047,7 @@ xfs_growfs_rt( nsbp->sb_blocksize * nsbp->sb_rextsize); nsbp->sb_rextents = nsbp->sb_rblocks; do_div(nsbp->sb_rextents, nsbp->sb_rextsize); + ASSERT(nsbp->sb_rextents != 0); nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents); nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1; nrsumsize = @@ -2015,17 +1059,15 @@ xfs_growfs_rt( * Start a transaction, get the log reservation. */ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE); - cancelflags = 0; - if ((error = xfs_trans_reserve(tp, 0, - XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0))) - goto error_exit; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtfree, + 0, 0); + if (error) + goto error_cancel; /* * Lock out other callers by grabbing the bitmap inode lock. */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, - XFS_ILOCK_EXCL, &ip))) - goto error_exit; - ASSERT(ip == mp->m_rbmip); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* * Update the bitmap inode's size. */ @@ -2036,10 +1078,8 @@ xfs_growfs_rt( /* * Get the summary inode into the transaction. */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, - XFS_ILOCK_EXCL, &ip))) - goto error_exit; - ASSERT(ip == mp->m_rsumip); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* * Update the summary inode's size. */ @@ -2053,7 +1093,7 @@ xfs_growfs_rt( mp->m_rsumlevels != nmp->m_rsumlevels) { error = xfs_rtcopy_summary(mp, nmp, tp); if (error) - goto error_exit; + goto error_cancel; } /* * Update superblock fields. @@ -2079,37 +1119,32 @@ xfs_growfs_rt( bp = NULL; error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents, nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); - if (error) - goto error_exit; + if (error) { +error_cancel: + xfs_trans_cancel(tp, cancelflags); + break; + } /* * Mark more blocks free in the superblock. */ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, nsbp->sb_rextents - sbp->sb_rextents); /* - * Free the fake mp structure. - */ - kmem_free(nmp, sizeof(*nmp)); - nmp = NULL; - /* * Update mp values into the real mp structure. */ mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - /* - * Commit the transaction. - */ - xfs_trans_commit(tp, 0, NULL); + + error = xfs_trans_commit(tp, 0); + if (error) + break; } - return 0; /* - * Error paths come here. + * Free the fake mp structure. */ -error_exit: - if (nmp) - kmem_free(nmp, sizeof(*nmp)); - xfs_trans_cancel(tp, cancelflags); + kmem_free(nmp); + return error; } @@ -2130,15 +1165,15 @@ xfs_rtallocate_extent( xfs_extlen_t prod, /* extent product factor */ xfs_rtblock_t *rtblock) /* out: start block allocated */ { + xfs_mount_t *mp = tp->t_mountp; int error; /* error value */ - xfs_inode_t *ip; /* inode for bitmap file */ - xfs_mount_t *mp; /* file system mount structure */ xfs_rtblock_t r; /* result allocated block */ xfs_fsblock_t sb; /* summary file block number */ xfs_buf_t *sumbp; /* summary file block buffer */ + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); ASSERT(minlen > 0 && minlen <= maxlen); - mp = tp->t_mountp; + /* * If prod is set then figure out what to do to minlen and maxlen. */ @@ -2154,12 +1189,7 @@ xfs_rtallocate_extent( return 0; } } - /* - * Lock out other callers by grabbing the bitmap inode lock. - */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, - XFS_ILOCK_EXCL, &ip))) - return error; + sumbp = NULL; /* * Allocate by size, or near another block, or exactly at some block. @@ -2178,11 +1208,12 @@ xfs_rtallocate_extent( len, &sumbp, &sb, prod, &r); break; default: + error = EIO; ASSERT(0); } - if (error) { + if (error) return error; - } + /* * If it worked, update the superblock. */ @@ -2200,69 +1231,6 @@ xfs_rtallocate_extent( } /* - * Free an extent in the realtime subvolume. Length is expressed in - * realtime extents, as is the block number. - */ -int /* error */ -xfs_rtfree_extent( - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t bno, /* starting block number to free */ - xfs_extlen_t len) /* length of extent freed */ -{ - int error; /* error value */ - xfs_inode_t *ip; /* bitmap file inode */ - xfs_mount_t *mp; /* file system mount structure */ - xfs_fsblock_t sb; /* summary file block number */ - xfs_buf_t *sumbp; /* summary file block buffer */ - - mp = tp->t_mountp; - /* - * Synchronize by locking the bitmap inode. - */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, - XFS_ILOCK_EXCL, &ip))) - return error; -#if defined(__KERNEL__) && defined(DEBUG) - /* - * Check to see that this whole range is currently allocated. - */ - { - int stat; /* result from checking range */ - - error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat); - if (error) { - return error; - } - ASSERT(stat); - } -#endif - sumbp = NULL; - /* - * Free the range of realtime blocks. - */ - error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); - if (error) { - return error; - } - /* - * Mark more blocks free in the superblock. - */ - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); - /* - * If we've now freed all the blocks, reset the file sequence - * number to 0. - */ - if (tp->t_frextents_delta + mp->m_sb.sb_frextents == - mp->m_sb.sb_rextents) { - if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) - ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; - *(__uint64_t *)&ip->i_d.di_atime = 0; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - return 0; -} - -/* * Initialize realtime fields in the mount structure. */ int /* error */ @@ -2271,15 +1239,14 @@ xfs_rtmount_init( { xfs_buf_t *bp; /* buffer for last block of subvolume */ xfs_daddr_t d; /* address of last block of subvolume */ - int error; /* error return value */ xfs_sb_t *sbp; /* filesystem superblock copy in mount */ sbp = &mp->m_sb; if (sbp->sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { - cmn_err(CE_WARN, - "XFS: This filesystem has a realtime volume, use rtdev=device option"); + xfs_warn(mp, + "Filesystem has a realtime volume, use rtdev=device option"); return XFS_ERROR(ENODEV); } mp->m_rsumlevels = sbp->sb_rextslog + 1; @@ -2293,20 +1260,19 @@ xfs_rtmount_init( */ d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { - cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", + xfs_warn(mp, "realtime mount -- %llu != %llu", (unsigned long long) XFS_BB_TO_FSB(mp, d), (unsigned long long) mp->m_sb.sb_rblocks); - return XFS_ERROR(E2BIG); + return XFS_ERROR(EFBIG); } - error = xfs_read_buf(mp, mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp); - if (error) { - cmn_err(CE_WARN, - "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); - if (error == ENOSPC) - return XFS_ERROR(E2BIG); - return error; + bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + d - XFS_FSB_TO_BB(mp, 1), + XFS_FSB_TO_BB(mp, 1), 0, NULL); + if (!bp || bp->b_error) { + xfs_warn(mp, "realtime device size check failed"); + if (bp) + xfs_buf_relse(bp); + return EIO; } xfs_buf_relse(bp); return 0; @@ -2326,20 +1292,30 @@ xfs_rtmount_inodes( sbp = &mp->m_sb; if (sbp->sb_rbmino == NULLFSINO) return 0; - error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip); if (error) return error; ASSERT(mp->m_rbmip != NULL); ASSERT(sbp->sb_rsumino != NULLFSINO); - error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); + error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip); if (error) { - VN_RELE(XFS_ITOV(mp->m_rbmip)); + IRELE(mp->m_rbmip); return error; } ASSERT(mp->m_rsumip != NULL); return 0; } +void +xfs_rtunmount_inodes( + struct xfs_mount *mp) +{ + if (mp->m_rbmip) + IRELE(mp->m_rbmip); + if (mp->m_rsumip) + IRELE(mp->m_rsumip); +} + /* * Pick an extent for allocation at the start of a new realtime file. * Use the sequence number stored in the atime field of the bitmap inode. @@ -2355,20 +1331,16 @@ xfs_rtpick_extent( xfs_rtblock_t *pick) /* result rt extent */ { xfs_rtblock_t b; /* result block */ - int error; /* error return value */ - xfs_inode_t *ip; /* bitmap incore inode */ int log2; /* log of sequence number */ __uint64_t resid; /* residual after log removed */ __uint64_t seq; /* sequence number of file creation */ __uint64_t *seqp; /* pointer to seqno in inode */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, - XFS_ILOCK_EXCL, &ip))) - return error; - ASSERT(ip == mp->m_rbmip); - seqp = (__uint64_t *)&ip->i_d.di_atime; - if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { - ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime; + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) { + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; *seqp = 0; } seq = *seqp; @@ -2384,64 +1356,7 @@ xfs_rtpick_extent( b = mp->m_sb.sb_rextents - len; } *seqp = seq + 1; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); *pick = b; return 0; } - -#ifdef DEBUG -/* - * Debug code: print out the value of a range in the bitmap. - */ -void -xfs_rtprint_range( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to print */ - xfs_extlen_t len) /* length to print */ -{ - xfs_extlen_t i; /* block number in the extent */ - - cmn_err(CE_DEBUG, "%Ld: ", (long long)start); - for (i = 0; i < len; i++) - cmn_err(CE_DEBUG, "%d", xfs_rtcheck_bit(mp, tp, start + i, 1)); - cmn_err(CE_DEBUG, "\n"); -} - -/* - * Debug code: print the summary file. - */ -void -xfs_rtprint_summary( - xfs_mount_t *mp, /* file system mount structure */ - xfs_trans_t *tp) /* transaction pointer */ -{ - xfs_suminfo_t c; /* summary data */ - xfs_rtblock_t i; /* bitmap block number */ - int l; /* summary information level */ - int p; /* flag for printed anything */ - xfs_fsblock_t sb; /* summary block number */ - xfs_buf_t *sumbp; /* summary block buffer */ - - sumbp = NULL; - for (l = 0; l < mp->m_rsumlevels; l++) { - for (p = 0, i = 0; i < mp->m_sb.sb_rbmblocks; i++) { - (void)xfs_rtget_summary(mp, tp, l, i, &sumbp, &sb, &c); - if (c) { - if (!p) { - cmn_err(CE_DEBUG, "%Ld-%Ld:", 1LL << l, - XFS_RTMIN((1LL << l) + - ((1LL << l) - 1LL), - mp->m_sb.sb_rextents)); - p = 1; - } - cmn_err(CE_DEBUG, " %Ld:%d", (long long)i, c); - } - } - if (p) - cmn_err(CE_DEBUG, "\n"); - } - if (sumbp) - xfs_trans_brelse(tp, sumbp); -} -#endif /* DEBUG */ diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0e0b4d2ec20..752b63d1030 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -18,60 +18,11 @@ #ifndef __XFS_RTALLOC_H__ #define __XFS_RTALLOC_H__ +/* kernel only definitions and functions */ + struct xfs_mount; struct xfs_trans; -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) - -/* Min and max rt extent sizes, specified in bytes */ -#define XFS_MAX_RTEXTSIZE (1024 * 1024 * 1024) /* 1GB */ -#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64KB */ -#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4KB */ - -/* - * Constants for bit manipulations. - */ -#define XFS_NBBYLOG 3 /* log2(NBBY) */ -#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ -#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) -#define XFS_NBWORD (1 << XFS_NBWORDLOG) -#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) - -#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize) -#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask) -#define XFS_BLOCKWSIZE(mp) ((mp)->m_blockwsize) -#define XFS_BLOCKWMASK(mp) ((mp)->m_blockwmask) - -/* - * Summary and bit manipulation macros. - */ -#define XFS_SUMOFFS(mp,ls,bb) ((int)((ls) * (mp)->m_sb.sb_rbmblocks + (bb))) -#define XFS_SUMOFFSTOBLOCK(mp,s) \ - (((s) * (uint)sizeof(xfs_suminfo_t)) >> (mp)->m_sb.sb_blocklog) -#define XFS_SUMPTR(mp,bp,so) \ - ((xfs_suminfo_t *)((char *)XFS_BUF_PTR(bp) + \ - (((so) * (uint)sizeof(xfs_suminfo_t)) & XFS_BLOCKMASK(mp)))) - -#define XFS_BITTOBLOCK(mp,bi) ((bi) >> (mp)->m_blkbit_log) -#define XFS_BLOCKTOBIT(mp,bb) ((bb) << (mp)->m_blkbit_log) -#define XFS_BITTOWORD(mp,bi) \ - ((int)(((bi) >> XFS_NBWORDLOG) & XFS_BLOCKWMASK(mp))) - -#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b)) -#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b)) - -#define XFS_RTLOBIT(w) xfs_lowbit32(w) -#define XFS_RTHIBIT(w) xfs_highbit32(w) - -#if XFS_BIG_BLKNOS -#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) -#else -#define XFS_RTBLOCKLOG(b) xfs_highbit32(b) -#endif - - -#ifdef __KERNEL__ - #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. @@ -110,6 +61,9 @@ xfs_rtfree_extent( int /* error */ xfs_rtmount_init( struct xfs_mount *mp); /* file system mount structure */ +void +xfs_rtunmount_inodes( + struct xfs_mount *mp); /* * Get the bitmap and summary inodes into the mount structure @@ -134,24 +88,6 @@ xfs_rtpick_extent( xfs_rtblock_t *pick); /* result rt extent */ /* - * Debug code: print out the value of a range in the bitmap. - */ -void -xfs_rtprint_range( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp, /* transaction pointer */ - xfs_rtblock_t start, /* starting block to print */ - xfs_extlen_t len); /* length to print */ - -/* - * Debug code: print the summary file. - */ -void -xfs_rtprint_summary( - struct xfs_mount *mp, /* file system mount structure */ - struct xfs_trans *tp); /* transaction pointer */ - -/* * Grow the realtime area of the filesystem. */ int @@ -159,15 +95,47 @@ xfs_growfs_rt( struct xfs_mount *mp, /* file system mount structure */ xfs_growfs_rt_t *in); /* user supplied growfs struct */ +/* + * From xfs_rtbitmap.c + */ +int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t block, int issum, struct xfs_buf **bpp); +int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val, + xfs_rtblock_t *new, int *stat); +int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_rtblock_t limit, + xfs_rtblock_t *rtblock); +int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, + xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, + xfs_fsblock_t *rsb); +int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, + xfs_rtblock_t start, xfs_extlen_t len, + struct xfs_buf **rbpp, xfs_fsblock_t *rsb); + + #else # define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +static inline int /* error */ +xfs_rtmount_init( + xfs_mount_t *mp) /* file system mount structure */ +{ + if (mp->m_sb.sb_rblocks == 0) + return 0; + + xfs_warn(mp, "Not built with CONFIG_XFS_RT"); + return ENOSYS; +} # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ -#endif /* __KERNEL__ */ - #endif /* __XFS_RTALLOC_H__ */ diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c new file mode 100644 index 00000000000..f4dd697cac0 --- /dev/null +++ b/fs/xfs/xfs_rtbitmap.c @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_bmap.h" +#include "xfs_bmap_util.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trans.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_buf.h" +#include "xfs_icache.h" +#include "xfs_dinode.h" +#include "xfs_rtalloc.h" + + +/* + * Realtime allocator bitmap functions shared with userspace. + */ + +/* + * Get a buffer for the bitmap or summary file block specified. + * The buffer is returned read and locked. + */ +int +xfs_rtbuf_get( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t block, /* block number in bitmap or summary */ + int issum, /* is summary not bitmap */ + xfs_buf_t **bpp) /* output: buffer for the block */ +{ + xfs_buf_t *bp; /* block buffer, result */ + xfs_inode_t *ip; /* bitmap or summary inode */ + xfs_bmbt_irec_t map; + int nmap = 1; + int error; /* error value */ + + ip = issum ? mp->m_rsumip : mp->m_rbmip; + + error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK); + if (error) + return error; + + ASSERT(map.br_startblock != NULLFSBLOCK); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, map.br_startblock), + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + *bpp = bp; + return 0; +} + +/* + * Searching backward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_back( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t firstbit; /* first useful bit in the word */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = start - limit + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit < XFS_NBWORD - 1) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0); + mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) << + firstbit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = bit - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i = bit - firstbit + 1; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the previous one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to previous block if that's where the previous word is + * and we need the previous word. + */ + if (--word == -1 && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, --block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + word = XFS_BLOCKWMASK(mp); + b = &bufp[word]; + } else { + /* + * Go on to the previous word in the buffer. + */ + b--; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if (len - i) { + /* + * Calculate first (leftmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + firstbit = XFS_NBWORD - (len - i); + mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff); + *rtblock = start - i + 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start - i + 1; + return 0; +} + +/* + * Searching forward from start to limit, find the first block whose + * allocated/free state is different from start's. + */ +int +xfs_rtfind_forw( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to look at */ + xfs_rtblock_t limit, /* last block to look at */ + xfs_rtblock_t *rtblock) /* out: start block found */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in the word */ + xfs_rtblock_t len; /* length of inspected area */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t want; /* mask for "good" values */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute and read in starting bitmap block for starting block. + */ + block = XFS_BITTOBLOCK(mp, start); + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Get the first word's index & point to it. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + len = limit - start + 1; + /* + * Compute match value, based on the bit at start: if 1 (free) + * then all-ones, else all-zeroes. + */ + want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0; + /* + * If the starting position is not word-aligned, deal with the + * partial word. + */ + if (bit) { + /* + * Calculate last (rightmost) bit number to look at, + * and mask for all the relevant bits in this word. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Calculate the difference between the value there + * and what we're looking for. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different. Mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *rtblock = start + i - 1; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the previous one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the previous word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ want)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Calculate mask for all the relevant bits in this word. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ want) & mask)) { + /* + * Different, mark where we are and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *rtblock = start + i - 1; + return 0; + } else + i = len; + } + /* + * No match, return that we scanned the whole area. + */ + xfs_trans_brelse(tp, bp); + *rtblock = start + i - 1; + return 0; +} + +/* + * Read and modify the summary information for a given extent size, + * bitmap block combination. + * Keeps track of a current summary block, so we don't keep reading + * it from the buffer cache. + */ +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_buf_t *bp; /* buffer for the summary block */ + int error; /* error value */ + xfs_fsblock_t sb; /* summary fsblock */ + int so; /* index into the summary file */ + xfs_suminfo_t *sp; /* pointer to returned data */ + + /* + * Compute entry number in the summary file. + */ + so = XFS_SUMOFFS(mp, log, bbno); + /* + * Compute the block number in the summary file. + */ + sb = XFS_SUMOFFSTOBLOCK(mp, so); + /* + * If we have an old buffer, and the block number matches, use that. + */ + if (rbpp && *rbpp && *rsb == sb) + bp = *rbpp; + /* + * Otherwise we have to get the buffer. + */ + else { + /* + * If there was an old one, get rid of it first. + */ + if (rbpp && *rbpp) + xfs_trans_brelse(tp, *rbpp); + error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); + if (error) { + return error; + } + /* + * Remember this buffer and block for the next call. + */ + if (rbpp) { + *rbpp = bp; + *rsb = sb; + } + } + /* + * Point to the summary information, modify and log it. + */ + sp = XFS_SUMPTR(mp, bp, so); + *sp += delta; + xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), + (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); + return 0; +} + +/* + * Set the given range of bitmap bits to the given value. + * Do whatever I/O and logging is required. + */ +int +xfs_rtmodify_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to modify */ + xfs_extlen_t len, /* length of extent to modify */ + int val) /* 1 for free, 0 for allocated */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtword_t *first; /* first used word in the buffer */ + int i; /* current bit number rel. to start */ + int lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask o frelevant bits for value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number. + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block, and point to its data. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + first = b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zeroes; 1 (free) => all ones. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not changed and mask of relevant bits. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + i = lastbit - bit; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Set the word value correctly. + */ + *b = val; + i += XFS_NBWORD; + /* + * Go on to the next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * Log the changed part of this block. + * Get the next one. + */ + xfs_trans_log_buf(tp, bp, + (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp)); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + first = b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Compute a mask of relevant bits. + */ + bit = 0; + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Set/clear the active bits. + */ + if (val) + *b |= mask; + else + *b &= ~mask; + b++; + } + /* + * Log any remaining changed bytes. + */ + if (b > first) + xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp), + (uint)((char *)b - (char *)bufp - 1)); + return 0; +} + +/* + * Mark an extent specified by start and len freed. + * Updates all the summary information as well as the bitmap. + */ +int +xfs_rtfree_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block to free */ + xfs_extlen_t len, /* length to free */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + xfs_rtblock_t end; /* end of the freed extent */ + int error; /* error value */ + xfs_rtblock_t postblock; /* first block freed > end */ + xfs_rtblock_t preblock; /* first block freed < start */ + + end = start + len - 1; + /* + * Modify the bitmap to mark this extent freed. + */ + error = xfs_rtmodify_range(mp, tp, start, len, 1); + if (error) { + return error; + } + /* + * Assume we're freeing out of the middle of an allocated extent. + * We need to find the beginning and end of the extent so we can + * properly update the summary. + */ + error = xfs_rtfind_back(mp, tp, start, 0, &preblock); + if (error) { + return error; + } + /* + * Find the next allocated block (end of allocated extent). + */ + error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, + &postblock); + if (error) + return error; + /* + * If there are blocks not being freed at the front of the + * old extent, add summary data for them to be allocated. + */ + if (preblock < start) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(start - preblock), + XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * If there are blocks not being freed at the end of the + * old extent, add summary data for them to be allocated. + */ + if (postblock > end) { + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock - end), + XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb); + if (error) { + return error; + } + } + /* + * Increment the summary information corresponding to the entire + * (new) free extent. + */ + error = xfs_rtmodify_summary(mp, tp, + XFS_RTBLOCKLOG(postblock + 1 - preblock), + XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb); + return error; +} + +/* + * Check that the given range is either all allocated (val = 0) or + * all free (val = 1). + */ +int +xfs_rtcheck_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t start, /* starting block number of extent */ + xfs_extlen_t len, /* length of extent */ + int val, /* 1 for free, 0 for allocated */ + xfs_rtblock_t *new, /* out: first block not matching */ + int *stat) /* out: 1 for matches, 0 for not */ +{ + xfs_rtword_t *b; /* current word in buffer */ + int bit; /* bit number in the word */ + xfs_rtblock_t block; /* bitmap block number */ + xfs_buf_t *bp; /* buf for the block */ + xfs_rtword_t *bufp; /* starting word in buffer */ + int error; /* error value */ + xfs_rtblock_t i; /* current bit number rel. to start */ + xfs_rtblock_t lastbit; /* last useful bit in word */ + xfs_rtword_t mask; /* mask of relevant bits for value */ + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + + /* + * Compute starting bitmap block number + */ + block = XFS_BITTOBLOCK(mp, start); + /* + * Read the bitmap block. + */ + error = xfs_rtbuf_get(mp, tp, block, 0, &bp); + if (error) { + return error; + } + bufp = bp->b_addr; + /* + * Compute the starting word's address, and starting bit. + */ + word = XFS_BITTOWORD(mp, start); + b = &bufp[word]; + bit = (int)(start & (XFS_NBWORD - 1)); + /* + * 0 (allocated) => all zero's; 1 (free) => all one's. + */ + val = -val; + /* + * If not starting on a word boundary, deal with the first + * (partial) word. + */ + if (bit) { + /* + * Compute first bit not examined. + */ + lastbit = XFS_RTMIN(bit + len, XFS_NBWORD); + /* + * Mask of relevant bits. + */ + mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i = XFS_RTLOBIT(wdiff) - bit; + *new = start + i; + *stat = 0; + return 0; + } + i = lastbit - bit; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } else { + /* + * Starting on a word boundary, no partial word. + */ + i = 0; + } + /* + * Loop over whole words in buffers. When we use up one buffer + * we move on to the next one. + */ + while (len - i >= XFS_NBWORD) { + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = *b ^ val)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } + i += XFS_NBWORD; + /* + * Go on to next block if that's where the next word is + * and we need the next word. + */ + if (++word == XFS_BLOCKWSIZE(mp) && i < len) { + /* + * If done with this block, get the next one. + */ + xfs_trans_brelse(tp, bp); + error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp); + if (error) { + return error; + } + b = bufp = bp->b_addr; + word = 0; + } else { + /* + * Go on to the next word in the buffer. + */ + b++; + } + } + /* + * If not ending on a word boundary, deal with the last + * (partial) word. + */ + if ((lastbit = len - i)) { + /* + * Mask of relevant bits. + */ + mask = ((xfs_rtword_t)1 << lastbit) - 1; + /* + * Compute difference between actual and desired value. + */ + if ((wdiff = (*b ^ val) & mask)) { + /* + * Different, compute first wrong bit and return. + */ + xfs_trans_brelse(tp, bp); + i += XFS_RTLOBIT(wdiff); + *new = start + i; + *stat = 0; + return 0; + } else + i = len; + } + /* + * Successful, return. + */ + xfs_trans_brelse(tp, bp); + *new = start + i; + *stat = 1; + return 0; +} + +#ifdef DEBUG +/* + * Check that the given extent (block range) is allocated already. + */ +STATIC int /* error */ +xfs_rtcheck_alloc_range( + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number of extent */ + xfs_extlen_t len) /* length of extent */ +{ + xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */ + int stat; + int error; + + error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat); + if (error) + return error; + ASSERT(stat); + return 0; +} +#else +#define xfs_rtcheck_alloc_range(m,t,b,l) (0) +#endif +/* + * Free an extent in the realtime subvolume. Length is expressed in + * realtime extents, as is the block number. + */ +int /* error */ +xfs_rtfree_extent( + xfs_trans_t *tp, /* transaction pointer */ + xfs_rtblock_t bno, /* starting block number to free */ + xfs_extlen_t len) /* length of extent freed */ +{ + int error; /* error value */ + xfs_mount_t *mp; /* file system mount structure */ + xfs_fsblock_t sb; /* summary file block number */ + xfs_buf_t *sumbp = NULL; /* summary file block buffer */ + + mp = tp->t_mountp; + + ASSERT(mp->m_rbmip->i_itemp != NULL); + ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); + + error = xfs_rtcheck_alloc_range(mp, tp, bno, len); + if (error) + return error; + + /* + * Free the range of realtime blocks. + */ + error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb); + if (error) { + return error; + } + /* + * Mark more blocks free in the superblock. + */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len); + /* + * If we've now freed all the blocks, reset the file sequence + * number to 0. + */ + if (tp->t_frextents_delta + mp->m_sb.sb_frextents == + mp->m_sb.sb_rextents) { + if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) + mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; + *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0; + xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); + } + return 0; +} + diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c deleted file mode 100644 index defb2febaaf..00000000000 --- a/fs/xfs/xfs_rw.c +++ /dev/null @@ -1,427 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_attr.h" -#include "xfs_bmap.h" -#include "xfs_acl.h" -#include "xfs_mac.h" -#include "xfs_error.h" -#include "xfs_buf_item.h" -#include "xfs_rw.h" - -/* - * This is a subroutine for xfs_write() and other writers (xfs_ioctl) - * which clears the setuid and setgid bits when a file is written. - */ -int -xfs_write_clear_setuid( - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - xfs_trans_t *tp; - int error; - - mp = ip->i_mount; - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - if ((error = xfs_trans_reserve(tp, 0, - XFS_WRITEID_LOG_RES(mp), - 0, 0, 0))) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - ip->i_d.di_mode &= ~S_ISUID; - - /* - * Note that we don't have to worry about mandatory - * file locking being disabled here because we only - * clear the S_ISGID bit if the Group execute bit is - * on, but if it was on then mandatory locking wouldn't - * have been enabled. - */ - if (ip->i_d.di_mode & S_IXGRP) { - ip->i_d.di_mode &= ~S_ISGID; - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return 0; -} - -/* - * Handle logging requirements of various synchronous types of write. - */ -int -xfs_write_sync_logforce( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - int error = 0; - - /* - * If we're treating this as O_DSYNC and we have not updated the - * size, force the log. - */ - if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && - !(ip->i_update_size)) { - xfs_inode_log_item_t *iip = ip->i_itemp; - - /* - * If an allocation transaction occurred - * without extending the size, then we have to force - * the log up the proper point to ensure that the - * allocation is permanent. We can't count on - * the fact that buffered writes lock out direct I/O - * writes - the direct I/O write could have extended - * the size nontransactionally, then finished before - * we started. xfs_write_file will think that the file - * didn't grow but the update isn't safe unless the - * size change is logged. - * - * Force the log if we've committed a transaction - * against the inode or if someone else has and - * the commit record hasn't gone to disk (e.g. - * the inode is pinned). This guarantees that - * all changes affecting the inode are permanent - * when we return. - */ - if (iip && iip->ili_last_lsn) { - xfs_log_force(mp, iip->ili_last_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC); - } else if (xfs_ipincount(ip) > 0) { - xfs_log_force(mp, (xfs_lsn_t)0, - XFS_LOG_FORCE | XFS_LOG_SYNC); - } - - } else { - xfs_trans_t *tp; - - /* - * O_SYNC or O_DSYNC _with_ a size update are handled - * the same way. - * - * If the write was synchronous then we need to make - * sure that the inode modification time is permanent. - * We'll have updated the timestamp above, so here - * we use a synchronous transaction to log the inode. - * It's not fast, but it's necessary. - * - * If this a dsync write and the size got changed - * non-transactionally, then we need to ensure that - * the size change gets logged in a synchronous - * transaction. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); - if ((error = xfs_trans_reserve(tp, 0, - XFS_SWRITE_LOG_RES(mp), - 0, 0, 0))) { - /* Transaction reserve failed */ - xfs_trans_cancel(tp, 0); - } else { - /* Transaction reserve successful */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - } - - return error; -} - -/* - * Force a shutdown of the filesystem instantly while keeping - * the filesystem consistent. We don't do an unmount here; just shutdown - * the shop, make sure that absolutely nothing persistent happens to - * this filesystem after this point. - */ - -void -xfs_do_force_shutdown( - bhv_desc_t *bdp, - int flags, - char *fname, - int lnnum) -{ - int logerror; - xfs_mount_t *mp; - - mp = XFS_BHVTOM(bdp); - logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " - "line %d of file %s. Return address = 0x%p", - mp->m_fsname, flags, lnnum, fname, __return_address); - } - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, - "Corruption of in-memory data detected. Shutting down filesystem: %s", - mp->m_fsname); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) { - xfs_stack_trace(); - } - } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - if (logerror) { - xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, - "Log I/O Error Detected. Shutting down filesystem: %s", - mp->m_fsname); - } else if (flags & SHUTDOWN_DEVICE_REQ) { - xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, - "All device paths lost. Shutting down filesystem: %s", - mp->m_fsname); - } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { - xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, - "I/O Error Detected. Shutting down filesystem: %s", - mp->m_fsname); - } - } - if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { - cmn_err(CE_ALERT, "Please umount the filesystem, " - "and rectify the problem(s)"); - } -} - - -/* - * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call biodone - * so that the proper iodone callbacks get called. - */ -int -xfs_bioerror( - xfs_buf_t *bp) -{ - -#ifdef XFSERRORDEBUG - ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); -#endif - - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - */ - xfs_buftrace("XFS IOERROR", bp); - XFS_BUF_ERROR(bp, EIO); - /* - * We're calling biodone, so delete B_DONE flag. Either way - * we have to call the iodone callback, and calling biodone - * probably is the best way since it takes care of - * GRIO as well. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_UNDONE(bp); - XFS_BUF_STALE(bp); - - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - xfs_biodone(bp); - - return (EIO); -} - -/* - * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the biodone call. - * This is meant for userdata errors; metadata bufs come with - * iodone functions attached, so that we can track down errors. - */ -int -xfs_bioerror_relse( - xfs_buf_t *bp) -{ - int64_t fl; - - ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); - ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); - - xfs_buftrace("XFS IOERRELSE", bp); - fl = XFS_BUF_BFLAGS(bp); - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - * - * chunkhold expects B_DONE to be set, whether - * we actually finish the I/O or not. We don't want to - * change that interface. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_DONE(bp); - XFS_BUF_STALE(bp); - XFS_BUF_CLR_IODONE_FUNC(bp); - XFS_BUF_CLR_BDSTRAT_FUNC(bp); - if (!(fl & XFS_B_ASYNC)) { - /* - * Mark b_error and B_ERROR _both_. - * Lot's of chunkcache code assumes that. - * There's no reason to mark error for - * ASYNC buffers. - */ - XFS_BUF_ERROR(bp, EIO); - XFS_BUF_V_IODONESEMA(bp); - } else { - xfs_buf_relse(bp); - } - return (EIO); -} - -/* - * Prints out an ALERT message about I/O error. - */ -void -xfs_ioerror_alert( - char *func, - struct xfs_mount *mp, - xfs_buf_t *bp, - xfs_daddr_t blkno) -{ - cmn_err(CE_ALERT, - "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" - " (\"%s\") error %d buf count %zd", - (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, - XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), - (__uint64_t)blkno, func, - XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); -} - -/* - * This isn't an absolute requirement, but it is - * just a good idea to call xfs_read_buf instead of - * directly doing a read_buf call. For one, we shouldn't - * be doing this disk read if we are in SHUTDOWN state anyway, - * so this stops that from happening. Secondly, this does all - * the error checking stuff and the brelse if appropriate for - * the caller, so the code can be a little leaner. - */ - -int -xfs_read_buf( - struct xfs_mount *mp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) -{ - xfs_buf_t *bp; - int error; - - if (flags) - bp = xfs_buf_read_flags(target, blkno, len, flags); - else - bp = xfs_buf_read(target, blkno, len, flags); - if (!bp) - return XFS_ERROR(EIO); - error = XFS_BUF_GETERROR(bp); - if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { - *bpp = bp; - } else { - *bpp = NULL; - if (error) { - xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); - } else { - error = XFS_ERROR(EIO); - } - if (bp) { - XFS_BUF_UNDONE(bp); - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); - /* - * brelse clears B_ERROR and b_error - */ - xfs_buf_relse(bp); - } - } - return (error); -} - -/* - * Wrapper around bwrite() so that we can trap - * write errors, and act accordingly. - */ -int -xfs_bwrite( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - int error; - - /* - * XXXsup how does this work for quotas. - */ - XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); - XFS_BUF_SET_FSPRIVATE3(bp, mp); - XFS_BUF_WRITE(bp); - - if ((error = XFS_bwrite(bp))) { - ASSERT(mp); - /* - * Cannot put a buftrace here since if the buffer is not - * B_HOLD then we will brelse() the buffer before returning - * from bwrite and we could be tracing a buffer that has - * been reused. - */ - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - } - return (error); -} diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h deleted file mode 100644 index 188b296ff50..00000000000 --- a/fs/xfs/xfs_rw.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_RW_H__ -#define __XFS_RW_H__ - -struct xfs_buf; -struct xfs_inode; -struct xfs_mount; - -/* - * Maximum count of bmaps used by read and write paths. - */ -#define XFS_MAX_RW_NBMAPS 4 - -/* - * Counts of readahead buffers to use based on physical memory size. - * None of these should be more than XFS_MAX_RW_NBMAPS. - */ -#define XFS_RW_NREADAHEAD_16MB 2 -#define XFS_RW_NREADAHEAD_32MB 3 -#define XFS_RW_NREADAHEAD_K32 4 -#define XFS_RW_NREADAHEAD_K64 4 - -/* - * Maximum size of a buffer that we\'ll map. Making this - * too big will degrade performance due to the number of - * pages which need to be gathered. Making it too small - * will prevent us from doing large I/O\'s to hardware that - * needs it. - * - * This is currently set to 512 KB. - */ -#define XFS_MAX_BMAP_LEN_BB 1024 -#define XFS_MAX_BMAP_LEN_BYTES 524288 - -/* - * Convert the given file system block to a disk block. - * We have to treat it differently based on whether the - * file is a real time file or not, because the bmap code - * does. - */ -#define XFS_FSB_TO_DB(ip,fsb) xfs_fsb_to_db(ip,fsb) -static inline xfs_daddr_t -xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) -{ - return (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) ? \ - (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); -} -#define XFS_FSB_TO_DB_IO(io,fsb) xfs_fsb_to_db_io(io,fsb) -static inline xfs_daddr_t -xfs_fsb_to_db_io(struct xfs_iocore *io, xfs_fsblock_t fsb) -{ - return (((io)->io_flags & XFS_IOCORE_RT) ? \ - XFS_FSB_TO_BB((io)->io_mount, (fsb)) : \ - XFS_FSB_TO_DADDR((io)->io_mount, (fsb))); -} - -/* - * Prototypes for functions in xfs_rw.c. - */ -extern int xfs_write_clear_setuid(struct xfs_inode *ip); -extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); -extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); -extern int xfs_bioerror(struct xfs_buf *bp); -extern int xfs_bioerror_relse(struct xfs_buf *bp); -extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, - xfs_daddr_t blkno, int len, uint flags, - struct xfs_buf **bpp); -extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, - xfs_buf_t *bp, xfs_daddr_t blkno); - -/* - * Prototypes for functions in xfs_vnodeops.c. - */ -extern int xfs_rwlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); -extern void xfs_rwunlock(bhv_desc_t *bdp, bhv_vrwlock_t write_lock); -extern int xfs_setattr(bhv_desc_t *, bhv_vattr_t *vap, int flags, - cred_t *credp); -extern int xfs_change_file_space(bhv_desc_t *bdp, int cmd, xfs_flock64_t *bf, - xfs_off_t offset, cred_t *credp, int flags); -extern int xfs_set_dmattrs(bhv_desc_t *bdp, u_int evmask, u_int16_t state, - cred_t *credp); - -#endif /* __XFS_RW_H__ */ diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c new file mode 100644 index 00000000000..7703fa6770f --- /dev/null +++ b/fs/xfs/xfs_sb.c @@ -0,0 +1,836 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_dinode.h" +#include "xfs_bmap_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" + +/* + * Physical superblock buffer manipulations. Shared with libxfs in userspace. + */ + +static const struct { + short offset; + short type; /* 0 = integer + * 1 = binary / string (no translation) + */ +} xfs_sb_info[] = { + { offsetof(xfs_sb_t, sb_magicnum), 0 }, + { offsetof(xfs_sb_t, sb_blocksize), 0 }, + { offsetof(xfs_sb_t, sb_dblocks), 0 }, + { offsetof(xfs_sb_t, sb_rblocks), 0 }, + { offsetof(xfs_sb_t, sb_rextents), 0 }, + { offsetof(xfs_sb_t, sb_uuid), 1 }, + { offsetof(xfs_sb_t, sb_logstart), 0 }, + { offsetof(xfs_sb_t, sb_rootino), 0 }, + { offsetof(xfs_sb_t, sb_rbmino), 0 }, + { offsetof(xfs_sb_t, sb_rsumino), 0 }, + { offsetof(xfs_sb_t, sb_rextsize), 0 }, + { offsetof(xfs_sb_t, sb_agblocks), 0 }, + { offsetof(xfs_sb_t, sb_agcount), 0 }, + { offsetof(xfs_sb_t, sb_rbmblocks), 0 }, + { offsetof(xfs_sb_t, sb_logblocks), 0 }, + { offsetof(xfs_sb_t, sb_versionnum), 0 }, + { offsetof(xfs_sb_t, sb_sectsize), 0 }, + { offsetof(xfs_sb_t, sb_inodesize), 0 }, + { offsetof(xfs_sb_t, sb_inopblock), 0 }, + { offsetof(xfs_sb_t, sb_fname[0]), 1 }, + { offsetof(xfs_sb_t, sb_blocklog), 0 }, + { offsetof(xfs_sb_t, sb_sectlog), 0 }, + { offsetof(xfs_sb_t, sb_inodelog), 0 }, + { offsetof(xfs_sb_t, sb_inopblog), 0 }, + { offsetof(xfs_sb_t, sb_agblklog), 0 }, + { offsetof(xfs_sb_t, sb_rextslog), 0 }, + { offsetof(xfs_sb_t, sb_inprogress), 0 }, + { offsetof(xfs_sb_t, sb_imax_pct), 0 }, + { offsetof(xfs_sb_t, sb_icount), 0 }, + { offsetof(xfs_sb_t, sb_ifree), 0 }, + { offsetof(xfs_sb_t, sb_fdblocks), 0 }, + { offsetof(xfs_sb_t, sb_frextents), 0 }, + { offsetof(xfs_sb_t, sb_uquotino), 0 }, + { offsetof(xfs_sb_t, sb_gquotino), 0 }, + { offsetof(xfs_sb_t, sb_qflags), 0 }, + { offsetof(xfs_sb_t, sb_flags), 0 }, + { offsetof(xfs_sb_t, sb_shared_vn), 0 }, + { offsetof(xfs_sb_t, sb_inoalignmt), 0 }, + { offsetof(xfs_sb_t, sb_unit), 0 }, + { offsetof(xfs_sb_t, sb_width), 0 }, + { offsetof(xfs_sb_t, sb_dirblklog), 0 }, + { offsetof(xfs_sb_t, sb_logsectlog), 0 }, + { offsetof(xfs_sb_t, sb_logsectsize), 0 }, + { offsetof(xfs_sb_t, sb_logsunit), 0 }, + { offsetof(xfs_sb_t, sb_features2), 0 }, + { offsetof(xfs_sb_t, sb_bad_features2), 0 }, + { offsetof(xfs_sb_t, sb_features_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_ro_compat), 0 }, + { offsetof(xfs_sb_t, sb_features_incompat), 0 }, + { offsetof(xfs_sb_t, sb_features_log_incompat), 0 }, + { offsetof(xfs_sb_t, sb_crc), 0 }, + { offsetof(xfs_sb_t, sb_pad), 0 }, + { offsetof(xfs_sb_t, sb_pquotino), 0 }, + { offsetof(xfs_sb_t, sb_lsn), 0 }, + { sizeof(xfs_sb_t), 0 } +}; + +/* + * Reference counting access wrappers to the perag structures. + * Because we never free per-ag structures, the only thing we + * have to protect against changes is the tree structure itself. + */ +struct xfs_perag * +xfs_perag_get( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + rcu_read_lock(); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + ref = atomic_inc_return(&pag->pag_ref); + } + rcu_read_unlock(); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +/* + * search from @first to find the next perag with the given tag set. + */ +struct xfs_perag * +xfs_perag_get_tag( + struct xfs_mount *mp, + xfs_agnumber_t first, + int tag) +{ + struct xfs_perag *pag; + int found; + int ref; + + rcu_read_lock(); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, first, 1, tag); + if (found <= 0) { + rcu_read_unlock(); + return NULL; + } + ref = atomic_inc_return(&pag->pag_ref); + rcu_read_unlock(); + trace_xfs_perag_get_tag(mp, pag->pag_agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put( + struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + +/* + * Check the validity of the SB found. + */ +STATIC int +xfs_mount_validate_sb( + xfs_mount_t *mp, + xfs_sb_t *sbp, + bool check_inprogress, + bool check_version) +{ + + /* + * If the log device and data device have the + * same device number, the log is internal. + * Consequently, the sb_logstart should be non-zero. If + * we have a zero sb_logstart in this case, we may be trying to mount + * a volume filesystem in a non-volume manner. + */ + if (sbp->sb_magicnum != XFS_SB_MAGIC) { + xfs_warn(mp, "bad magic number"); + return XFS_ERROR(EWRONGFS); + } + + + if (!xfs_sb_good_version(sbp)) { + xfs_warn(mp, "bad version"); + return XFS_ERROR(EWRONGFS); + } + + /* + * Version 5 superblock feature mask validation. Reject combinations the + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. + */ + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (xfs_sb_has_compat_feature(sbp, + XFS_SB_FEAT_COMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown compatible features (0x%x) enabled.\n" +"Using a more recent kernel is recommended.", + (sbp->sb_features_compat & + XFS_SB_FEAT_COMPAT_UNKNOWN)); + } + + if (xfs_sb_has_ro_compat_feature(sbp, + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, +"Superblock has unknown read-only compatible features (0x%x) enabled.", + (sbp->sb_features_ro_compat & + XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, +"Attempted to mount read-only compatible filesystem read-write.\n" +"Filesystem can only be safely mounted read only."); + return XFS_ERROR(EINVAL); + } + } + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { + xfs_warn(mp, +"Superblock has unknown incompatible features (0x%x) enabled.\n" +"Filesystem can not be safely mounted by this kernel.", + (sbp->sb_features_incompat & + XFS_SB_FEAT_INCOMPAT_UNKNOWN)); + return XFS_ERROR(EINVAL); + } + } + + if (xfs_sb_version_has_pquotino(sbp)) { + if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { + xfs_notice(mp, + "Version 5 of Super block has XFS_OQUOTA bits."); + return XFS_ERROR(EFSCORRUPTED); + } + } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | + XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { + xfs_notice(mp, +"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits."); + return XFS_ERROR(EFSCORRUPTED); + } + + if (unlikely( + sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an external log; " + "specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + if (unlikely( + sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { + xfs_warn(mp, + "filesystem is marked as having an internal log; " + "do not specify logdev on the mount command line."); + return XFS_ERROR(EINVAL); + } + + /* + * More sanity checking. Most of these were stolen directly from + * xfs_repair. + */ + if (unlikely( + sbp->sb_agcount <= 0 || + sbp->sb_sectsize < XFS_MIN_SECTORSIZE || + sbp->sb_sectsize > XFS_MAX_SECTORSIZE || + sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || + sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || + sbp->sb_sectsize != (1 << sbp->sb_sectlog) || + sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || + sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || + sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || + sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || + (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || + (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || + sbp->sb_dblocks == 0 || + sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || + sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || + sbp->sb_shared_vn != 0)) { + xfs_notice(mp, "SB sanity check failed"); + return XFS_ERROR(EFSCORRUPTED); + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + sbp->sb_blocksize, PAGE_SIZE); + return XFS_ERROR(ENOSYS); + } + + /* + * Currently only very few inode sizes are supported. + */ + switch (sbp->sb_inodesize) { + case 256: + case 512: + case 1024: + case 2048: + break; + default: + xfs_warn(mp, "inode size of %d bytes not supported", + sbp->sb_inodesize); + return XFS_ERROR(ENOSYS); + } + + if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || + xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + return XFS_ERROR(EFBIG); + } + + if (check_inprogress && sbp->sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); + return XFS_ERROR(EFSCORRUPTED); + } + return 0; +} + +void +xfs_sb_quota_from_disk(struct xfs_sb *sbp) +{ + /* + * older mkfs doesn't initialize quota inodes to NULLFSINO. This + * leads to in-core values having two different values for a quota + * inode to be invalid: 0 and NULLFSINO. Change it to a single value + * NULLFSINO. + * + * Note that this change affect only the in-core values. These + * values are not written back to disk unless any quota information + * is written to the disk. Even in that case, sb_pquotino field is + * not written to disk unless the superblock supports pquotino. + */ + if (sbp->sb_uquotino == 0) + sbp->sb_uquotino = NULLFSINO; + if (sbp->sb_gquotino == 0) + sbp->sb_gquotino = NULLFSINO; + if (sbp->sb_pquotino == 0) + sbp->sb_pquotino = NULLFSINO; + + /* + * We need to do these manipilations only if we are working + * with an older version of on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(sbp)) + return; + + if (sbp->sb_qflags & XFS_OQUOTA_ENFD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; + if (sbp->sb_qflags & XFS_OQUOTA_CHKD) + sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? + XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; + sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); + + if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + /* + * In older version of superblock, on-disk superblock only + * has sb_gquotino, and in-core superblock has both sb_gquotino + * and sb_pquotino. But, only one of them is supported at any + * point of time. So, if PQUOTA is set in disk superblock, + * copy over sb_gquotino to sb_pquotino. + */ + sbp->sb_pquotino = sbp->sb_gquotino; + sbp->sb_gquotino = NULLFSINO; + } +} + +void +xfs_sb_from_disk( + struct xfs_sb *to, + xfs_dsb_t *from) +{ + to->sb_magicnum = be32_to_cpu(from->sb_magicnum); + to->sb_blocksize = be32_to_cpu(from->sb_blocksize); + to->sb_dblocks = be64_to_cpu(from->sb_dblocks); + to->sb_rblocks = be64_to_cpu(from->sb_rblocks); + to->sb_rextents = be64_to_cpu(from->sb_rextents); + memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); + to->sb_logstart = be64_to_cpu(from->sb_logstart); + to->sb_rootino = be64_to_cpu(from->sb_rootino); + to->sb_rbmino = be64_to_cpu(from->sb_rbmino); + to->sb_rsumino = be64_to_cpu(from->sb_rsumino); + to->sb_rextsize = be32_to_cpu(from->sb_rextsize); + to->sb_agblocks = be32_to_cpu(from->sb_agblocks); + to->sb_agcount = be32_to_cpu(from->sb_agcount); + to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); + to->sb_logblocks = be32_to_cpu(from->sb_logblocks); + to->sb_versionnum = be16_to_cpu(from->sb_versionnum); + to->sb_sectsize = be16_to_cpu(from->sb_sectsize); + to->sb_inodesize = be16_to_cpu(from->sb_inodesize); + to->sb_inopblock = be16_to_cpu(from->sb_inopblock); + memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); + to->sb_blocklog = from->sb_blocklog; + to->sb_sectlog = from->sb_sectlog; + to->sb_inodelog = from->sb_inodelog; + to->sb_inopblog = from->sb_inopblog; + to->sb_agblklog = from->sb_agblklog; + to->sb_rextslog = from->sb_rextslog; + to->sb_inprogress = from->sb_inprogress; + to->sb_imax_pct = from->sb_imax_pct; + to->sb_icount = be64_to_cpu(from->sb_icount); + to->sb_ifree = be64_to_cpu(from->sb_ifree); + to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); + to->sb_frextents = be64_to_cpu(from->sb_frextents); + to->sb_uquotino = be64_to_cpu(from->sb_uquotino); + to->sb_gquotino = be64_to_cpu(from->sb_gquotino); + to->sb_qflags = be16_to_cpu(from->sb_qflags); + to->sb_flags = from->sb_flags; + to->sb_shared_vn = from->sb_shared_vn; + to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); + to->sb_unit = be32_to_cpu(from->sb_unit); + to->sb_width = be32_to_cpu(from->sb_width); + to->sb_dirblklog = from->sb_dirblklog; + to->sb_logsectlog = from->sb_logsectlog; + to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); + to->sb_logsunit = be32_to_cpu(from->sb_logsunit); + to->sb_features2 = be32_to_cpu(from->sb_features2); + to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); + to->sb_features_compat = be32_to_cpu(from->sb_features_compat); + to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); + to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); + to->sb_features_log_incompat = + be32_to_cpu(from->sb_features_log_incompat); + to->sb_pad = 0; + to->sb_pquotino = be64_to_cpu(from->sb_pquotino); + to->sb_lsn = be64_to_cpu(from->sb_lsn); +} + +static inline void +xfs_sb_quota_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t *fields) +{ + __uint16_t qflags = from->sb_qflags; + + /* + * We need to do these manipilations only if we are working + * with an older version of on-disk superblock. + */ + if (xfs_sb_version_has_pquotino(from)) + return; + + if (*fields & XFS_SB_QFLAGS) { + /* + * The in-core version of sb_qflags do not have + * XFS_OQUOTA_* flags, whereas the on-disk version + * does. So, convert incore XFS_{PG}QUOTA_* flags + * to on-disk XFS_OQUOTA_* flags. + */ + qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | + XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); + + if (from->sb_qflags & + (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) + qflags |= XFS_OQUOTA_ENFD; + if (from->sb_qflags & + (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) + qflags |= XFS_OQUOTA_CHKD; + to->sb_qflags = cpu_to_be16(qflags); + *fields &= ~XFS_SB_QFLAGS; + } + + /* + * GQUOTINO and PQUOTINO cannot be used together in versions of + * superblock that do not have pquotino. from->sb_flags tells us which + * quota is active and should be copied to disk. If neither are active, + * make sure we write NULLFSINO to the sb_gquotino field as a quota + * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature + * bit is set. + * + * Note that we don't need to handle the sb_uquotino or sb_pquotino here + * as they do not require any translation. Hence the main sb field loop + * will write them appropriately from the in-core superblock. + */ + if ((*fields & XFS_SB_GQUOTINO) && + (from->sb_qflags & XFS_GQUOTA_ACCT)) + to->sb_gquotino = cpu_to_be64(from->sb_gquotino); + else if ((*fields & XFS_SB_PQUOTINO) && + (from->sb_qflags & XFS_PQUOTA_ACCT)) + to->sb_gquotino = cpu_to_be64(from->sb_pquotino); + else { + /* + * We can't rely on just the fields being logged to tell us + * that it is safe to write NULLFSINO - we should only do that + * if quotas are not actually enabled. Hence only write + * NULLFSINO if both in-core quota inodes are NULL. + */ + if (from->sb_gquotino == NULLFSINO && + from->sb_pquotino == NULLFSINO) + to->sb_gquotino = cpu_to_be64(NULLFSINO); + } + + *fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO); +} + +/* + * Copy in core superblock to ondisk one. + * + * The fields argument is mask of superblock fields to copy. + */ +void +xfs_sb_to_disk( + xfs_dsb_t *to, + xfs_sb_t *from, + __int64_t fields) +{ + xfs_caddr_t to_ptr = (xfs_caddr_t)to; + xfs_caddr_t from_ptr = (xfs_caddr_t)from; + xfs_sb_field_t f; + int first; + int size; + + ASSERT(fields); + if (!fields) + return; + + xfs_sb_quota_to_disk(to, from, &fields); + while (fields) { + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + first = xfs_sb_info[f].offset; + size = xfs_sb_info[f + 1].offset - first; + + ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1); + + if (size == 1 || xfs_sb_info[f].type == 1) { + memcpy(to_ptr + first, from_ptr + first, size); + } else { + switch (size) { + case 2: + *(__be16 *)(to_ptr + first) = + cpu_to_be16(*(__u16 *)(from_ptr + first)); + break; + case 4: + *(__be32 *)(to_ptr + first) = + cpu_to_be32(*(__u32 *)(from_ptr + first)); + break; + case 8: + *(__be64 *)(to_ptr + first) = + cpu_to_be64(*(__u64 *)(from_ptr + first)); + break; + default: + ASSERT(0); + } + } + + fields &= ~(1LL << f); + } +} + +static int +xfs_sb_verify( + struct xfs_buf *bp, + bool check_version) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_sb sb; + + xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); + + /* + * Only check the in progress field for the primary superblock as + * mkfs.xfs doesn't clear it from secondary superblocks. + */ + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); +} + +/* + * If the superblock has the CRC feature bit set or the CRC field is non-null, + * check that the CRC is valid. We check the CRC field is non-null because a + * single bit error could clear the feature bit and unused parts of the + * superblock are supposed to be zero. Hence a non-null crc field indicates that + * we've potentially lost a feature bit and we should check it anyway. + * + * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the + * last field in V4 secondary superblocks. So for secondary superblocks, + * we are more forgiving, and ignore CRC failures if the primary doesn't + * indicate that the fs version is V5. + */ +static void +xfs_sb_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + int error; + + /* + * open code the version check to avoid needing to convert the entire + * superblock from disk order just to check the version number + */ + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && + (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == + XFS_SB_VERSION_5) || + dsb->sb_crc != 0)) { + + if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { + /* Only fail bad secondaries on a known V5 filesystem */ + if (bp->b_bn == XFS_SB_DADDR || + xfs_sb_version_hascrc(&mp->m_sb)) { + error = EFSBADCRC; + goto out_error; + } + } + } + error = xfs_sb_verify(bp, true); + +out_error: + if (error) { + xfs_buf_ioerror(bp, error); + if (error == EFSCORRUPTED || error == EFSBADCRC) + xfs_verifier_error(bp); + } +} + +/* + * We may be probed for a filesystem match, so we may not want to emit + * messages when the superblock buffer is not actually an XFS superblock. + * If we find an XFS superblock, then run a normal, noisy mount because we are + * really going to mount it and want to know about errors. + */ +static void +xfs_sb_quiet_read_verify( + struct xfs_buf *bp) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { + /* XFS filesystem, verify noisily! */ + xfs_sb_read_verify(bp); + return; + } + /* quietly fail */ + xfs_buf_ioerror(bp, EWRONGFS); +} + +static void +xfs_sb_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + int error; + + error = xfs_sb_verify(bp, false); + if (error) { + xfs_buf_ioerror(bp, error); + xfs_verifier_error(bp); + return; + } + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (bip) + XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); + + xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); +} + +const struct xfs_buf_ops xfs_sb_buf_ops = { + .verify_read = xfs_sb_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .verify_read = xfs_sb_quiet_read_verify, + .verify_write = xfs_sb_write_verify, +}; + +/* + * xfs_mount_common + * + * Mount initialization code establishing various mount + * fields from the superblock associated with the given + * mount structure + */ +void +xfs_sb_mount_common( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + mp->m_agfrotor = mp->m_agirotor = 0; + spin_lock_init(&mp->m_agirotor_lock); + mp->m_maxagi = mp->m_sb.sb_agcount; + mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; + mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; + mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; + mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; + mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog; + mp->m_blockmask = sbp->sb_blocksize - 1; + mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; + mp->m_blockwmask = mp->m_blockwsize - 1; + + mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; + mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; + + mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2; + mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2; + + mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); + mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); + mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; + mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; + + mp->m_bsize = XFS_FSB_TO_BB(mp, 1); + mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, + sbp->sb_inopblock); + mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; +} + +/* + * xfs_initialize_perag_data + * + * Read in each per-ag structure so we can count up the number of + * allocated inodes, free inodes and used filesystem blocks as this + * information is no longer persistent in the superblock. Once we have + * this information, write it into the in-core superblock structure. + */ +int +xfs_initialize_perag_data( + struct xfs_mount *mp, + xfs_agnumber_t agcount) +{ + xfs_agnumber_t index; + xfs_perag_t *pag; + xfs_sb_t *sbp = &mp->m_sb; + uint64_t ifree = 0; + uint64_t ialloc = 0; + uint64_t bfree = 0; + uint64_t bfreelst = 0; + uint64_t btree = 0; + int error; + + for (index = 0; index < agcount; index++) { + /* + * read the agf, then the agi. This gets us + * all the information we need and populates the + * per-ag structures for us. + */ + error = xfs_alloc_pagf_init(mp, NULL, index, 0); + if (error) + return error; + + error = xfs_ialloc_pagi_init(mp, NULL, index); + if (error) + return error; + pag = xfs_perag_get(mp, index); + ifree += pag->pagi_freecount; + ialloc += pag->pagi_count; + bfree += pag->pagf_freeblks; + bfreelst += pag->pagf_flcount; + btree += pag->pagf_btreeblks; + xfs_perag_put(pag); + } + /* + * Overwrite incore superblock counters with just-read data + */ + spin_lock(&mp->m_sb_lock); + sbp->sb_ifree = ifree; + sbp->sb_icount = ialloc; + sbp->sb_fdblocks = bfree + bfreelst + btree; + spin_unlock(&mp->m_sb_lock); + + /* Fixup the per-cpu counters as well. */ + xfs_icsb_reinit_counters(mp); + + return 0; +} + +/* + * xfs_mod_sb() can be used to copy arbitrary changes to the + * in-core superblock into the superblock buffer to be logged. + * It does not provide the higher level of locking that is + * needed to protect the in-core superblock from concurrent + * access. + */ +void +xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) +{ + xfs_buf_t *bp; + int first; + int last; + xfs_mount_t *mp; + xfs_sb_field_t f; + + ASSERT(fields); + if (!fields) + return; + mp = tp->t_mountp; + bp = xfs_trans_getsb(tp, mp, 0); + first = sizeof(xfs_sb_t); + last = 0; + + /* translate/copy */ + + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); + + /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; + + f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + first = xfs_sb_info[f].offset; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(tp, bp, first, last); +} diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index bf168a91ddb..c43c2d609a2 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h @@ -26,16 +26,16 @@ struct xfs_buf; struct xfs_mount; +struct xfs_trans; #define XFS_SB_MAGIC 0x58465342 /* 'XFSB' */ #define XFS_SB_VERSION_1 1 /* 5.3, 6.0.1, 6.1 */ #define XFS_SB_VERSION_2 2 /* 6.2 - attributes */ #define XFS_SB_VERSION_3 3 /* 6.2 - new inode version */ #define XFS_SB_VERSION_4 4 /* 6.2+ - bitmask version */ +#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */ #define XFS_SB_VERSION_NUMBITS 0x000f #define XFS_SB_VERSION_ALLFBITS 0xfff0 -#define XFS_SB_VERSION_SASHFBITS 0xf000 -#define XFS_SB_VERSION_REALFBITS 0x0ff0 #define XFS_SB_VERSION_ATTRBIT 0x0010 #define XFS_SB_VERSION_NLINKBIT 0x0020 #define XFS_SB_VERSION_QUOTABIT 0x0040 @@ -46,28 +46,17 @@ struct xfs_mount; #define XFS_SB_VERSION_SECTORBIT 0x0800 #define XFS_SB_VERSION_EXTFLGBIT 0x1000 #define XFS_SB_VERSION_DIRV2BIT 0x2000 +#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 -#define XFS_SB_VERSION_OKSASHFBITS \ - (XFS_SB_VERSION_EXTFLGBIT | \ - XFS_SB_VERSION_DIRV2BIT) -#define XFS_SB_VERSION_OKREALFBITS \ - (XFS_SB_VERSION_ATTRBIT | \ - XFS_SB_VERSION_NLINKBIT | \ - XFS_SB_VERSION_QUOTABIT | \ - XFS_SB_VERSION_ALIGNBIT | \ - XFS_SB_VERSION_DALIGNBIT | \ - XFS_SB_VERSION_SHAREDBIT | \ - XFS_SB_VERSION_LOGV2BIT | \ - XFS_SB_VERSION_SECTORBIT | \ - XFS_SB_VERSION_MOREBITSBIT) -#define XFS_SB_VERSION_OKSASHBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_REALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) -#define XFS_SB_VERSION_OKREALBITS \ - (XFS_SB_VERSION_NUMBITS | \ - XFS_SB_VERSION_OKREALFBITS | \ - XFS_SB_VERSION_OKSASHFBITS) + +/* + * Supported feature bit list is just all bits in the versionnum field because + * we've used them all up and understand them all. Except, of course, for the + * shared superblock bit, which nobody knows what it does and so is unsupported. + */ +#define XFS_SB_VERSION_OKBITS \ + ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \ + ~XFS_SB_VERSION_SHAREDBIT) /* * There are two words to hold XFS "feature" bits: the original @@ -76,25 +65,26 @@ struct xfs_mount; * * These defines represent bits in sb_features2. */ -#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */ #define XFS_SB_VERSION2_RESERVED1BIT 0x00000001 -#define XFS_SB_VERSION2_RESERVED2BIT 0x00000002 +#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */ #define XFS_SB_VERSION2_RESERVED4BIT 0x00000004 #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ -#define XFS_SB_VERSION2_SASHFBITS 0xff000000 /* Mask: features that - require changing - PROM and SASH */ - -#define XFS_SB_VERSION2_OKREALFBITS \ - (XFS_SB_VERSION2_ATTR2BIT) -#define XFS_SB_VERSION2_OKSASHFBITS \ - (0) -#define XFS_SB_VERSION2_OKREALBITS \ - (XFS_SB_VERSION2_OKREALFBITS | \ - XFS_SB_VERSION2_OKSASHFBITS ) - -typedef struct xfs_sb -{ +#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ +#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ +#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ +#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */ + +#define XFS_SB_VERSION2_OKBITS \ + (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ + XFS_SB_VERSION2_ATTR2BIT | \ + XFS_SB_VERSION2_PROJID32BIT | \ + XFS_SB_VERSION2_FTYPE) + +/* + * Superblock - in core version. Must match the ondisk version below. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_sb { __uint32_t sb_magicnum; /* magic number == XFS_SB_MAGIC */ __uint32_t sb_blocksize; /* logical block size, bytes */ xfs_drfsbno_t sb_dblocks; /* number of data blocks */ @@ -149,8 +139,118 @@ typedef struct xfs_sb __uint16_t sb_logsectsize; /* sector size for the log, bytes */ __uint32_t sb_logsunit; /* stripe unit size for the log */ __uint32_t sb_features2; /* additional feature bits */ + + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __uint32_t sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __uint32_t sb_features_compat; + __uint32_t sb_features_ro_compat; + __uint32_t sb_features_incompat; + __uint32_t sb_features_log_incompat; + + __uint32_t sb_crc; /* superblock crc */ + __uint32_t sb_pad; + + xfs_ino_t sb_pquotino; /* project quota inode */ + xfs_lsn_t sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ } xfs_sb_t; +#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc) + +/* + * Superblock - on disk version. Must match the in core version above. + * Must be padded to 64 bit alignment. + */ +typedef struct xfs_dsb { + __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ + __be32 sb_blocksize; /* logical block size, bytes */ + __be64 sb_dblocks; /* number of data blocks */ + __be64 sb_rblocks; /* number of realtime blocks */ + __be64 sb_rextents; /* number of realtime extents */ + uuid_t sb_uuid; /* file system unique id */ + __be64 sb_logstart; /* starting block of log if internal */ + __be64 sb_rootino; /* root inode number */ + __be64 sb_rbmino; /* bitmap inode for realtime extents */ + __be64 sb_rsumino; /* summary inode for rt bitmap */ + __be32 sb_rextsize; /* realtime extent size, blocks */ + __be32 sb_agblocks; /* size of an allocation group */ + __be32 sb_agcount; /* number of allocation groups */ + __be32 sb_rbmblocks; /* number of rt bitmap blocks */ + __be32 sb_logblocks; /* number of log blocks */ + __be16 sb_versionnum; /* header version == XFS_SB_VERSION */ + __be16 sb_sectsize; /* volume sector size, bytes */ + __be16 sb_inodesize; /* inode size, bytes */ + __be16 sb_inopblock; /* inodes per block */ + char sb_fname[12]; /* file system name */ + __u8 sb_blocklog; /* log2 of sb_blocksize */ + __u8 sb_sectlog; /* log2 of sb_sectsize */ + __u8 sb_inodelog; /* log2 of sb_inodesize */ + __u8 sb_inopblog; /* log2 of sb_inopblock */ + __u8 sb_agblklog; /* log2 of sb_agblocks (rounded up) */ + __u8 sb_rextslog; /* log2 of sb_rextents */ + __u8 sb_inprogress; /* mkfs is in progress, don't mount */ + __u8 sb_imax_pct; /* max % of fs for inode space */ + /* statistics */ + /* + * These fields must remain contiguous. If you really + * want to change their layout, make sure you fix the + * code in xfs_trans_apply_sb_deltas(). + */ + __be64 sb_icount; /* allocated inodes */ + __be64 sb_ifree; /* free inodes */ + __be64 sb_fdblocks; /* free data blocks */ + __be64 sb_frextents; /* free realtime extents */ + /* + * End contiguous fields. + */ + __be64 sb_uquotino; /* user quota inode */ + __be64 sb_gquotino; /* group quota inode */ + __be16 sb_qflags; /* quota flags */ + __u8 sb_flags; /* misc. flags */ + __u8 sb_shared_vn; /* shared version number */ + __be32 sb_inoalignmt; /* inode chunk alignment, fsblocks */ + __be32 sb_unit; /* stripe or raid unit */ + __be32 sb_width; /* stripe or raid width */ + __u8 sb_dirblklog; /* log2 of dir block size (fsbs) */ + __u8 sb_logsectlog; /* log2 of the log sector size */ + __be16 sb_logsectsize; /* sector size for the log, bytes */ + __be32 sb_logsunit; /* stripe unit size for the log */ + __be32 sb_features2; /* additional feature bits */ + /* + * bad features2 field as a result of failing to pad the sb + * structure to 64 bits. Some machines will be using this field + * for features2 bits. Easiest just to mark it bad and not use + * it for anything else. + */ + __be32 sb_bad_features2; + + /* version 5 superblock fields start here */ + + /* feature masks */ + __be32 sb_features_compat; + __be32 sb_features_ro_compat; + __be32 sb_features_incompat; + __be32 sb_features_log_incompat; + + __le32 sb_crc; /* superblock crc */ + __be32 sb_pad; + + __be64 sb_pquotino; /* project quota inode */ + __be64 sb_lsn; /* last write sequence */ + + /* must be padded to 64 bit alignment */ +} xfs_dsb_t; + /* * Sequence number values for the fields. */ @@ -167,7 +267,10 @@ typedef enum { XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, + XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, + XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, + XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, + XFS_SBS_PQUOTINO, XFS_SBS_LSN, XFS_SBS_FIELDCOUNT } xfs_sb_field_t; @@ -188,14 +291,27 @@ typedef enum { #define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) #define XFS_SB_UNIT XFS_SB_MVAL(UNIT) #define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) +#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) +#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) +#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) #define XFS_SB_FEATURES2 XFS_SB_MVAL(FEATURES2) +#define XFS_SB_BAD_FEATURES2 XFS_SB_MVAL(BAD_FEATURES2) +#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) +#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) +#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) +#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) +#define XFS_SB_CRC XFS_SB_MVAL(CRC) +#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) #define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) #define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) #define XFS_SB_MOD_BITS \ (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_FEATURES2) + XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ + XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \ + XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \ + XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO) /* @@ -212,248 +328,255 @@ typedef enum { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -#define XFS_SB_GOOD_VERSION(sbp) xfs_sb_good_version(sbp) -#ifdef __KERNEL__ -static inline int xfs_sb_good_version(xfs_sb_t *sbp) +/* + * The first XFS version we support is a v4 superblock with V2 directories. + */ +static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) { - return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ - (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ - (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))); + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + + /* check for unknown features in the fs */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + return true; } -#else -static inline int xfs_sb_good_version(xfs_sb_t *sbp) + +static inline bool xfs_sb_good_version(struct xfs_sb *sbp) { - return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \ - (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \ - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \ - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \ - (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \ - (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)))); + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) + return true; + if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) + return xfs_sb_good_v4_features(sbp); + return false; } -#endif /* __KERNEL__ */ - -#define XFS_SB_GOOD_SASH_VERSION(sbp) \ - ((((sbp)->sb_versionnum >= XFS_SB_VERSION_1) && \ - ((sbp)->sb_versionnum <= XFS_SB_VERSION_3)) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - !((sbp)->sb_versionnum & ~XFS_SB_VERSION_OKSASHBITS))) -#define XFS_SB_VERSION_TONEW(v) xfs_sb_version_tonew(v) -static inline unsigned xfs_sb_version_tonew(unsigned v) +/* + * Detect a mismatched features2 field. Older kernels read/wrote + * this into the wrong slot, so to be safe we keep them in sync. + */ +static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) { - return ((((v) == XFS_SB_VERSION_1) ? \ - 0 : \ - (((v) == XFS_SB_VERSION_2) ? \ - XFS_SB_VERSION_ATTRBIT : \ - (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \ - XFS_SB_VERSION_4); + return sbp->sb_bad_features2 != sbp->sb_features2; } -#define XFS_SB_VERSION_TOOLD(v) xfs_sb_version_toold(v) -static inline unsigned xfs_sb_version_toold(unsigned v) +static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) { - return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \ - 0 : \ - (((v) & XFS_SB_VERSION_NLINKBIT) ? \ - XFS_SB_VERSION_3 : \ - (((v) & XFS_SB_VERSION_ATTRBIT) ? \ - XFS_SB_VERSION_2 : \ - XFS_SB_VERSION_1))); + return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); } -#define XFS_SB_VERSION_HASATTR(sbp) xfs_sb_version_hasattr(sbp) -static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) { - return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \ - ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT)); + sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -#define XFS_SB_VERSION_ADDATTR(sbp) xfs_sb_version_addattr(sbp) -static inline void xfs_sb_version_addattr(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) { - (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \ - XFS_SB_VERSION_2 : \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \ - (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT))); + return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); } -#define XFS_SB_VERSION_HASNLINK(sbp) xfs_sb_version_hasnlink(sbp) -static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp) +static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { - return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \ - ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT)); + sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -#define XFS_SB_VERSION_ADDNLINK(sbp) xfs_sb_version_addnlink(sbp) -static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) { - (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \ - XFS_SB_VERSION_3 : \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT)); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); } -#define XFS_SB_VERSION_HASQUOTA(sbp) xfs_sb_version_hasquota(sbp) -static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); } -#define XFS_SB_VERSION_ADDQUOTA(sbp) xfs_sb_version_addquota(sbp) -static inline void xfs_sb_version_addquota(xfs_sb_t *sbp) +static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) { - (sbp)->sb_versionnum = \ - (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \ - (XFS_SB_VERSION_TONEW((sbp)->sb_versionnum) | \ - XFS_SB_VERSION_QUOTABIT)); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); } -#define XFS_SB_VERSION_HASALIGN(sbp) xfs_sb_version_hasalign(sbp) -static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); } -#define XFS_SB_VERSION_SUBALIGN(sbp) xfs_sb_version_subalign(sbp) -static inline void xfs_sb_version_subalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) { - (sbp)->sb_versionnum = \ - XFS_SB_VERSION_TOOLD((sbp)->sb_versionnum & ~XFS_SB_VERSION_ALIGNBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); } -#define XFS_SB_VERSION_HASDALIGN(sbp) xfs_sb_version_hasdalign(sbp) -static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); + return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); } -#define XFS_SB_VERSION_ADDDALIGN(sbp) xfs_sb_version_adddalign(sbp) -static inline int xfs_sb_version_adddalign(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_DALIGNBIT); + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } -#define XFS_SB_VERSION_HASSHARED(sbp) xfs_sb_version_hasshared(sbp) -static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp) +/* + * sb_features2 bit version macros. + */ +static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); } -#define XFS_SB_VERSION_ADDSHARED(sbp) xfs_sb_version_addshared(sbp) -static inline int xfs_sb_version_addshared(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) { - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_SHAREDBIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); } -#define XFS_SB_VERSION_SUBSHARED(sbp) xfs_sb_version_subshared(sbp) -static inline int xfs_sb_version_subshared(xfs_sb_t *sbp) +static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum & ~XFS_SB_VERSION_SHAREDBIT); + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -#define XFS_SB_VERSION_HASDIRV2(sbp) xfs_sb_version_hasdirv2(sbp) -static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp) +static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT); + sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; + if (!sbp->sb_features2) + sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; } -#define XFS_SB_VERSION_HASLOGV2(sbp) xfs_sb_version_haslogv2(sbp) -static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp) +static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); } -#define XFS_SB_VERSION_HASEXTFLGBIT(sbp) xfs_sb_version_hasextflgbit(sbp) -static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp) +static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT); + sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; + sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; + sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT; } -#define XFS_SB_VERSION_ADDEXTFLGBIT(sbp) xfs_sb_version_addextflgbit(sbp) -static inline int xfs_sb_version_addextflgbit(xfs_sb_t *sbp) +/* + * Extended v5 superblock feature masks. These are to be used for new v5 + * superblock features only. + * + * Compat features are new features that old kernels will not notice or affect + * and so can mount read-write without issues. + * + * RO-Compat (read only) are features that old kernels can read but will break + * if they write. Hence only read-only mounts of such filesystems are allowed on + * kernels that don't support the feature bit. + * + * InCompat features are features which old kernels will not understand and so + * must not mount. + * + * Log-InCompat features are for changes to log formats or new transactions that + * can't be replayed on older kernels. The fields are set when the filesystem is + * mounted, and a clean unmount clears the fields. + */ +#define XFS_SB_FEAT_COMPAT_ALL 0 +#define XFS_SB_FEAT_COMPAT_UNKNOWN ~XFS_SB_FEAT_COMPAT_ALL +static inline bool +xfs_sb_has_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_EXTFLGBIT); + return (sbp->sb_features_compat & feature) != 0; } -#define XFS_SB_VERSION_SUBEXTFLGBIT(sbp) xfs_sb_version_subextflgbit(sbp) -static inline int xfs_sb_version_subextflgbit(xfs_sb_t *sbp) +#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */ +#define XFS_SB_FEAT_RO_COMPAT_ALL \ + (XFS_SB_FEAT_RO_COMPAT_FINOBT) +#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL +static inline bool +xfs_sb_has_ro_compat_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - return (sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum & ~XFS_SB_VERSION_EXTFLGBIT); + return (sbp->sb_features_ro_compat & feature) != 0; } -#define XFS_SB_VERSION_HASSECTOR(sbp) xfs_sb_version_hassector(sbp) -static inline int xfs_sb_version_hassector(xfs_sb_t *sbp) +#define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_ALL \ + (XFS_SB_FEAT_INCOMPAT_FTYPE) + +#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL +static inline bool +xfs_sb_has_incompat_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT); + return (sbp->sb_features_incompat & feature) != 0; } -#define XFS_SB_VERSION_HASMOREBITS(sbp) xfs_sb_version_hasmorebits(sbp) -static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp) +#define XFS_SB_FEAT_INCOMPAT_LOG_ALL 0 +#define XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_LOG_ALL +static inline bool +xfs_sb_has_incompat_log_feature( + struct xfs_sb *sbp, + __uint32_t feature) { - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \ - ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); + return (sbp->sb_features_log_incompat & feature) != 0; } /* - * sb_features2 bit version macros. - * - * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro: - * - * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp) - * ((XFS_SB_VERSION_HASMOREBITS(sbp) && - * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT) + * V5 superblock specific feature checks */ +static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} + +static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; +} -#define XFS_SB_VERSION_HASATTR2(sbp) xfs_sb_version_hasattr2(sbp) -static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp) +static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) { - return (XFS_SB_VERSION_HASMOREBITS(sbp)) && \ - ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || + (xfs_sb_version_hasmorebits(sbp) && + (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); } -#define XFS_SB_VERSION_ADDATTR2(sbp) xfs_sb_version_addattr2(sbp) -static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) +static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) { - ((sbp)->sb_versionnum = \ - ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \ - ((sbp)->sb_features2 = \ - ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT))); + return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && + (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } /* * end of superblock version macros */ +static inline bool +xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) +{ + return (ino == sbp->sb_uquotino || + ino == sbp->sb_gquotino || + ino == sbp->sb_pquotino); +} + #define XFS_SB_DADDR ((xfs_daddr_t)0) /* daddr in filesystem/ag */ #define XFS_SB_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_SB_DADDR) -#define XFS_BUF_TO_SBP(bp) ((xfs_sb_t *)XFS_BUF_PTR(bp)) +#define XFS_BUF_TO_SBP(bp) ((xfs_dsb_t *)((bp)->b_addr)) #define XFS_HDR_BLOCK(mp,d) ((xfs_agblock_t)XFS_BB_TO_FSBT(mp,d)) #define XFS_DADDR_TO_FSB(mp,d) XFS_AGB_TO_FSB(mp, \ - XFS_DADDR_TO_AGNO(mp,d), XFS_DADDR_TO_AGBNO(mp,d)) + xfs_daddr_to_agno(mp,d), xfs_daddr_to_agbno(mp,d)) #define XFS_FSB_TO_DADDR(mp,fsbno) XFS_AGB_TO_DADDR(mp, \ XFS_FSB_TO_AGNO(mp,fsbno), XFS_FSB_TO_AGBNO(mp,fsbno)) @@ -461,15 +584,6 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) * File system sector to basic block conversions. */ #define XFS_FSS_TO_BB(mp,sec) ((sec) << (mp)->m_sectbb_log) -#define XFS_BB_TO_FSS(mp,bb) \ - (((bb) + (XFS_FSS_TO_BB(mp,1) - 1)) >> (mp)->m_sectbb_log) -#define XFS_BB_TO_FSST(mp,bb) ((bb) >> (mp)->m_sectbb_log) - -/* - * File system sector to byte conversions. - */ -#define XFS_FSS_TO_B(mp,sectno) ((xfs_fsize_t)(sectno) << (mp)->m_sb.sb_sectlog) -#define XFS_B_TO_FSST(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_sectlog) /* * File system block to basic block conversions. @@ -478,7 +592,6 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) #define XFS_BB_TO_FSB(mp,bb) \ (((bb) + (XFS_FSB_TO_BB(mp,1) - 1)) >> (mp)->m_blkbb_log) #define XFS_BB_TO_FSBT(mp,bb) ((bb) >> (mp)->m_blkbb_log) -#define XFS_BB_FSB_OFFSET(mp,bb) ((bb) & ((mp)->m_bsize - 1)) /* * File system block to byte conversions. @@ -489,4 +602,20 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp) #define XFS_B_TO_FSBT(mp,b) (((__uint64_t)(b)) >> (mp)->m_sb.sb_blocklog) #define XFS_B_FSB_OFFSET(mp,b) ((b) & (mp)->m_blockmask) +/* + * perag get/put wrappers for ref counting + */ +extern struct xfs_perag *xfs_perag_get(struct xfs_mount *, xfs_agnumber_t); +extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t, + int tag); +extern void xfs_perag_put(struct xfs_perag *pag); +extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); + +extern void xfs_sb_calc_crc(struct xfs_buf *); +extern void xfs_mod_sb(struct xfs_trans *, __int64_t); +extern void xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *); +extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); +extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); +extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); + #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h new file mode 100644 index 00000000000..82404da2ca6 --- /dev/null +++ b/fs/xfs/xfs_shared.h @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2013 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_SHARED_H__ +#define __XFS_SHARED_H__ + +/* + * Definitions shared between kernel and userspace that don't fit into any other + * header file that is shared with userspace. + */ +struct xfs_ifork; +struct xfs_buf; +struct xfs_buf_ops; +struct xfs_mount; +struct xfs_trans; +struct xfs_inode; + +/* + * Buffer verifier operations are widely used, including userspace tools + */ +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_agf_buf_ops; +extern const struct xfs_buf_ops xfs_agfl_buf_ops; +extern const struct xfs_buf_ops xfs_allocbt_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops; +extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +extern const struct xfs_buf_ops xfs_bmbt_buf_ops; +extern const struct xfs_buf_ops xfs_da3_node_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; +extern const struct xfs_buf_ops xfs_agi_buf_ops; +extern const struct xfs_buf_ops xfs_inobt_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ops; +extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_sb_buf_ops; +extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; +extern const struct xfs_buf_ops xfs_symlink_buf_ops; + +/* + * Transaction types. Used to distinguish types of buffers. These never reach + * the log. + */ +#define XFS_TRANS_SETATTR_NOT_SIZE 1 +#define XFS_TRANS_SETATTR_SIZE 2 +#define XFS_TRANS_INACTIVE 3 +#define XFS_TRANS_CREATE 4 +#define XFS_TRANS_CREATE_TRUNC 5 +#define XFS_TRANS_TRUNCATE_FILE 6 +#define XFS_TRANS_REMOVE 7 +#define XFS_TRANS_LINK 8 +#define XFS_TRANS_RENAME 9 +#define XFS_TRANS_MKDIR 10 +#define XFS_TRANS_RMDIR 11 +#define XFS_TRANS_SYMLINK 12 +#define XFS_TRANS_SET_DMATTRS 13 +#define XFS_TRANS_GROWFS 14 +#define XFS_TRANS_STRAT_WRITE 15 +#define XFS_TRANS_DIOSTRAT 16 +/* 17 was XFS_TRANS_WRITE_SYNC */ +#define XFS_TRANS_WRITEID 18 +#define XFS_TRANS_ADDAFORK 19 +#define XFS_TRANS_ATTRINVAL 20 +#define XFS_TRANS_ATRUNCATE 21 +#define XFS_TRANS_ATTR_SET 22 +#define XFS_TRANS_ATTR_RM 23 +#define XFS_TRANS_ATTR_FLAG 24 +#define XFS_TRANS_CLEAR_AGI_BUCKET 25 +#define XFS_TRANS_QM_SBCHANGE 26 +/* + * Dummy entries since we use the transaction type to index into the + * trans_type[] in xlog_recover_print_trans_head() + */ +#define XFS_TRANS_DUMMY1 27 +#define XFS_TRANS_DUMMY2 28 +#define XFS_TRANS_QM_QUOTAOFF 29 +#define XFS_TRANS_QM_DQALLOC 30 +#define XFS_TRANS_QM_SETQLIM 31 +#define XFS_TRANS_QM_DQCLUSTER 32 +#define XFS_TRANS_QM_QINOCREATE 33 +#define XFS_TRANS_QM_QUOTAOFF_END 34 +#define XFS_TRANS_SB_UNIT 35 +#define XFS_TRANS_FSYNC_TS 36 +#define XFS_TRANS_GROWFSRT_ALLOC 37 +#define XFS_TRANS_GROWFSRT_ZERO 38 +#define XFS_TRANS_GROWFSRT_FREE 39 +#define XFS_TRANS_SWAPEXT 40 +#define XFS_TRANS_SB_COUNT 41 +#define XFS_TRANS_CHECKPOINT 42 +#define XFS_TRANS_ICREATE 43 +#define XFS_TRANS_CREATE_TMPFILE 44 +#define XFS_TRANS_TYPE_MAX 44 +/* new transaction types need to be reflected in xfs_logprint(8) */ + +#define XFS_TRANS_TYPES \ + { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \ + { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \ + { XFS_TRANS_INACTIVE, "INACTIVE" }, \ + { XFS_TRANS_CREATE, "CREATE" }, \ + { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \ + { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \ + { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \ + { XFS_TRANS_REMOVE, "REMOVE" }, \ + { XFS_TRANS_LINK, "LINK" }, \ + { XFS_TRANS_RENAME, "RENAME" }, \ + { XFS_TRANS_MKDIR, "MKDIR" }, \ + { XFS_TRANS_RMDIR, "RMDIR" }, \ + { XFS_TRANS_SYMLINK, "SYMLINK" }, \ + { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \ + { XFS_TRANS_GROWFS, "GROWFS" }, \ + { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \ + { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \ + { XFS_TRANS_WRITEID, "WRITEID" }, \ + { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \ + { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \ + { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \ + { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \ + { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \ + { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \ + { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \ + { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \ + { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \ + { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \ + { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \ + { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \ + { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \ + { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \ + { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \ + { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \ + { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \ + { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \ + { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ + { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ + { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ + { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \ + { XFS_TRANS_DUMMY1, "DUMMY1" }, \ + { XFS_TRANS_DUMMY2, "DUMMY2" }, \ + { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } + +/* + * This structure is used to track log items associated with + * a transaction. It points to the log item and keeps some + * flags to track the state of the log item. It also tracks + * the amount of space needed to log the item it describes + * once we get to commit processing (see xfs_trans_commit()). + */ +struct xfs_log_item_desc { + struct xfs_log_item *lid_item; + struct list_head lid_trans; + unsigned char lid_flags; +}; + +#define XFS_LID_DIRTY 0x1 + +/* log size calculation functions */ +int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); +int xfs_log_calc_minimum_size(struct xfs_mount *); + + +/* + * Values for t_flags. + */ +#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ +#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ +#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ +#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ +#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ +#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ +#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer + count in superblock */ +/* + * Values for call flags parameter. + */ +#define XFS_TRANS_RELEASE_LOG_RES 0x4 +#define XFS_TRANS_ABORT 0x8 + +/* + * Field values for xfs_trans_mod_sb. + */ +#define XFS_TRANS_SB_ICOUNT 0x00000001 +#define XFS_TRANS_SB_IFREE 0x00000002 +#define XFS_TRANS_SB_FDBLOCKS 0x00000004 +#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 +#define XFS_TRANS_SB_FREXTENTS 0x00000010 +#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 +#define XFS_TRANS_SB_DBLOCKS 0x00000040 +#define XFS_TRANS_SB_AGCOUNT 0x00000080 +#define XFS_TRANS_SB_IMAXPCT 0x00000100 +#define XFS_TRANS_SB_REXTSIZE 0x00000200 +#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 +#define XFS_TRANS_SB_RBLOCKS 0x00000800 +#define XFS_TRANS_SB_REXTENTS 0x00001000 +#define XFS_TRANS_SB_REXTSLOG 0x00002000 + +/* + * Here we centralize the specification of XFS meta-data buffer reference count + * values. This determines how hard the buffer cache tries to hold onto the + * buffer. + */ +#define XFS_AGF_REF 4 +#define XFS_AGI_REF 4 +#define XFS_AGFL_REF 3 +#define XFS_INO_BTREE_REF 3 +#define XFS_ALLOC_BTREE_REF 2 +#define XFS_BMAP_BTREE_REF 2 +#define XFS_DIR_BTREE_REF 2 +#define XFS_INO_REF 2 +#define XFS_ATTR_BTREE_REF 1 +#define XFS_DQUOT_REF 1 + +/* + * Flags for xfs_trans_ichgtime(). + */ +#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */ +#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */ +#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */ + + +/* + * Symlink decoding/encoding functions + */ +int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen); +int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset, + uint32_t size, struct xfs_buf *bp); +void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp, + struct xfs_inode *ip, struct xfs_ifork *ifp); + +#endif /* __XFS_SHARED_H__ */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c new file mode 100644 index 00000000000..f2240383d4b --- /dev/null +++ b/fs/xfs/xfs_stats.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include <linux/proc_fs.h> + +DEFINE_PER_CPU(struct xfsstats, xfsstats); + +static int counter_val(int idx) +{ + int val = 0, cpu; + + for_each_possible_cpu(cpu) + val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + return val; +} + +static int xfs_stat_proc_show(struct seq_file *m, void *v) +{ + int i, j; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static const struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, + /* we print both series of quota information together */ + { "qm", XFSSTAT_END_QM }, + }; + + /* Loop over all stats groups */ + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { + seq_printf(m, "%s", xstats[i].desc); + /* inner loop does each group */ + for (; j < xstats[i].endpoint; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + } + /* extra precision counters */ + for_each_possible_cpu(i) { + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + seq_printf(m, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + seq_printf(m, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + return 0; +} + +static int xfs_stat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xfs_stat_proc_show, NULL); +} + +static const struct file_operations xfs_stat_proc_fops = { + .owner = THIS_MODULE, + .open = xfs_stat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota interfaces */ +#ifdef CONFIG_XFS_QUOTA +static int xqm_proc_show(struct seq_file *m, void *v) +{ + /* maximum; incore; ratio free to inuse; freelist */ + seq_printf(m, "%d\t%d\t%d\t%u\n", + 0, + counter_val(XFSSTAT_END_XQMSTAT), + 0, + counter_val(XFSSTAT_END_XQMSTAT + 1)); + return 0; +} + +static int xqm_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqm_proc_show, NULL); +} + +static const struct file_operations xqm_proc_fops = { + .owner = THIS_MODULE, + .open = xqm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* legacy quota stats interface no 2 */ +static int xqmstat_proc_show(struct seq_file *m, void *v) +{ + int j; + + seq_printf(m, "qm"); + for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) + seq_printf(m, " %u", counter_val(j)); + seq_putc(m, '\n'); + return 0; +} + +static int xqmstat_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, xqmstat_proc_show, NULL); +} + +static const struct file_operations xqmstat_proc_fops = { + .owner = THIS_MODULE, + .open = xqmstat_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_XFS_QUOTA */ + +int +xfs_init_procfs(void) +{ + if (!proc_mkdir("fs/xfs", NULL)) + goto out; + + if (!proc_create("fs/xfs/stat", 0, NULL, + &xfs_stat_proc_fops)) + goto out_remove_xfs_dir; +#ifdef CONFIG_XFS_QUOTA + if (!proc_create("fs/xfs/xqmstat", 0, NULL, + &xqmstat_proc_fops)) + goto out_remove_stat_file; + if (!proc_create("fs/xfs/xqm", 0, NULL, + &xqm_proc_fops)) + goto out_remove_xqmstat_file; +#endif + return 0; + +#ifdef CONFIG_XFS_QUOTA + out_remove_xqmstat_file: + remove_proc_entry("fs/xfs/xqmstat", NULL); + out_remove_stat_file: + remove_proc_entry("fs/xfs/stat", NULL); +#endif + out_remove_xfs_dir: + remove_proc_entry("fs/xfs", NULL); + out: + return -ENOMEM; +} + +void +xfs_cleanup_procfs(void) +{ +#ifdef CONFIG_XFS_QUOTA + remove_proc_entry("fs/xfs/xqm", NULL); + remove_proc_entry("fs/xfs/xqmstat", NULL); +#endif + remove_proc_entry("fs/xfs/stat", NULL); + remove_proc_entry("fs/xfs", NULL); +} diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/xfs_stats.h index 8ba7a2fa6c1..c8f238b8299 100644 --- a/fs/xfs/linux-2.6/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -118,6 +118,97 @@ struct xfsstats { __uint32_t xb_page_retries; __uint32_t xb_page_found; __uint32_t xb_get_read; +/* Version 2 btree counters */ +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) + __uint32_t xs_abtb_2_lookup; + __uint32_t xs_abtb_2_compare; + __uint32_t xs_abtb_2_insrec; + __uint32_t xs_abtb_2_delrec; + __uint32_t xs_abtb_2_newroot; + __uint32_t xs_abtb_2_killroot; + __uint32_t xs_abtb_2_increment; + __uint32_t xs_abtb_2_decrement; + __uint32_t xs_abtb_2_lshift; + __uint32_t xs_abtb_2_rshift; + __uint32_t xs_abtb_2_split; + __uint32_t xs_abtb_2_join; + __uint32_t xs_abtb_2_alloc; + __uint32_t xs_abtb_2_free; + __uint32_t xs_abtb_2_moves; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) + __uint32_t xs_abtc_2_lookup; + __uint32_t xs_abtc_2_compare; + __uint32_t xs_abtc_2_insrec; + __uint32_t xs_abtc_2_delrec; + __uint32_t xs_abtc_2_newroot; + __uint32_t xs_abtc_2_killroot; + __uint32_t xs_abtc_2_increment; + __uint32_t xs_abtc_2_decrement; + __uint32_t xs_abtc_2_lshift; + __uint32_t xs_abtc_2_rshift; + __uint32_t xs_abtc_2_split; + __uint32_t xs_abtc_2_join; + __uint32_t xs_abtc_2_alloc; + __uint32_t xs_abtc_2_free; + __uint32_t xs_abtc_2_moves; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) + __uint32_t xs_bmbt_2_lookup; + __uint32_t xs_bmbt_2_compare; + __uint32_t xs_bmbt_2_insrec; + __uint32_t xs_bmbt_2_delrec; + __uint32_t xs_bmbt_2_newroot; + __uint32_t xs_bmbt_2_killroot; + __uint32_t xs_bmbt_2_increment; + __uint32_t xs_bmbt_2_decrement; + __uint32_t xs_bmbt_2_lshift; + __uint32_t xs_bmbt_2_rshift; + __uint32_t xs_bmbt_2_split; + __uint32_t xs_bmbt_2_join; + __uint32_t xs_bmbt_2_alloc; + __uint32_t xs_bmbt_2_free; + __uint32_t xs_bmbt_2_moves; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) + __uint32_t xs_ibt_2_lookup; + __uint32_t xs_ibt_2_compare; + __uint32_t xs_ibt_2_insrec; + __uint32_t xs_ibt_2_delrec; + __uint32_t xs_ibt_2_newroot; + __uint32_t xs_ibt_2_killroot; + __uint32_t xs_ibt_2_increment; + __uint32_t xs_ibt_2_decrement; + __uint32_t xs_ibt_2_lshift; + __uint32_t xs_ibt_2_rshift; + __uint32_t xs_ibt_2_split; + __uint32_t xs_ibt_2_join; + __uint32_t xs_ibt_2_alloc; + __uint32_t xs_ibt_2_free; + __uint32_t xs_ibt_2_moves; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) + __uint32_t xs_fibt_2_lookup; + __uint32_t xs_fibt_2_compare; + __uint32_t xs_fibt_2_insrec; + __uint32_t xs_fibt_2_delrec; + __uint32_t xs_fibt_2_newroot; + __uint32_t xs_fibt_2_killroot; + __uint32_t xs_fibt_2_increment; + __uint32_t xs_fibt_2_decrement; + __uint32_t xs_fibt_2_lshift; + __uint32_t xs_fibt_2_rshift; + __uint32_t xs_fibt_2_split; + __uint32_t xs_fibt_2_join; + __uint32_t xs_fibt_2_alloc; + __uint32_t xs_fibt_2_free; + __uint32_t xs_fibt_2_moves; +#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6) + __uint32_t xs_qm_dqreclaims; + __uint32_t xs_qm_dqreclaim_misses; + __uint32_t xs_qm_dquot_dups; + __uint32_t xs_qm_dqcachemisses; + __uint32_t xs_qm_dqcachehits; + __uint32_t xs_qm_dqwants; +#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2) + __uint32_t xs_qm_dquot; + __uint32_t xs_qm_dquot_unused; /* Extra precision counters */ __uint64_t xs_xstrat_bytes; __uint64_t xs_write_bytes; @@ -134,7 +225,7 @@ DECLARE_PER_CPU(struct xfsstats, xfsstats); #define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) #define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) -extern void xfs_init_procfs(void); +extern int xfs_init_procfs(void); extern void xfs_cleanup_procfs(void); @@ -144,8 +235,14 @@ extern void xfs_cleanup_procfs(void); # define XFS_STATS_DEC(count) # define XFS_STATS_ADD(count, inc) -static __inline void xfs_init_procfs(void) { }; -static __inline void xfs_cleanup_procfs(void) { }; +static inline int xfs_init_procfs(void) +{ + return 0; +} + +static inline void xfs_cleanup_procfs(void) +{ +} #endif /* !CONFIG_PROC_FS */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c new file mode 100644 index 00000000000..8f0333b3f7a --- /dev/null +++ b/fs/xfs/xfs_super.c @@ -0,0 +1,1809 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_inum.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_fsops.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_extfree_item.h" +#include "xfs_mru_cache.h" +#include "xfs_inode_item.h" +#include "xfs_icache.h" +#include "xfs_trace.h" +#include "xfs_icreate_item.h" +#include "xfs_dinode.h" +#include "xfs_filestream.h" +#include "xfs_quota.h" + +#include <linux/namei.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/mount.h> +#include <linux/mempool.h> +#include <linux/writeback.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/parser.h> + +static const struct super_operations xfs_super_operations; +static kmem_zone_t *xfs_ioend_zone; +mempool_t *xfs_ioend_pool; + +#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ +#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ +#define MNTOPT_LOGDEV "logdev" /* log device */ +#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ +#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ +#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ +#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ +#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ +#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ +#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ +#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ +#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ +#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ +#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ +#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ +#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ +#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ +#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ +#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and + * unwritten extent conversion */ +#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ +#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_32BITINODE "inode32" /* inode allocation limited to + * XFS_MAXINUMBER_32 */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ +#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ +#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ +#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes + * in stat(). */ +#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ +#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ +#define MNTOPT_FILESTREAM "filestreams" /* use filestreams allocator */ +#define MNTOPT_QUOTA "quota" /* disk quotas (user) */ +#define MNTOPT_NOQUOTA "noquota" /* no quotas */ +#define MNTOPT_USRQUOTA "usrquota" /* user quota enabled */ +#define MNTOPT_GRPQUOTA "grpquota" /* group quota enabled */ +#define MNTOPT_PRJQUOTA "prjquota" /* project quota enabled */ +#define MNTOPT_UQUOTA "uquota" /* user quota (IRIX variant) */ +#define MNTOPT_GQUOTA "gquota" /* group quota (IRIX variant) */ +#define MNTOPT_PQUOTA "pquota" /* project quota (IRIX variant) */ +#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */ +#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ +#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ +#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ +#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ +#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ +#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ +#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ + +/* + * Table driven mount option parser. + * + * Currently only used for remount, but it will be used for mount + * in the future, too. + */ +enum { + Opt_barrier, + Opt_nobarrier, + Opt_inode64, + Opt_inode32, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_barrier, "barrier"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_inode64, "inode64"}, + {Opt_inode32, "inode32"}, + {Opt_err, NULL} +}; + + +STATIC unsigned long +suffix_kstrtoint(char *s, unsigned int base, int *res) +{ + int last, shift_left_factor = 0, _res; + char *value = s; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoint(s, base, &_res)) + return -EINVAL; + *res = _res << shift_left_factor; + return 0; +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock has _not_ yet been read in. + * + * Note that this function leaks the various device name allocations on + * failure. The caller takes care of them. + */ +STATIC int +xfs_parseargs( + struct xfs_mount *mp, + char *options) +{ + struct super_block *sb = mp->m_super; + char *this_char, *value; + int dsunit = 0; + int dswidth = 0; + int iosize = 0; + __uint8_t iosizelog = 0; + + /* + * set up the mount name first so all the errors will refer to the + * correct device. + */ + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + /* + * Copy binary VFS mount flags we are interested in. + */ + if (sb->s_flags & MS_RDONLY) + mp->m_flags |= XFS_MOUNT_RDONLY; + if (sb->s_flags & MS_DIRSYNC) + mp->m_flags |= XFS_MOUNT_DIRSYNC; + if (sb->s_flags & MS_SYNCHRONOUS) + mp->m_flags |= XFS_MOUNT_WSYNC; + + /* + * Set some default flags that could be cleared by the mount option + * parsing. + */ + mp->m_flags |= XFS_MOUNT_BARRIER; + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; +#if !XFS_BIG_INUMS + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; +#endif + + /* + * These can be overridden by the mount option parsing. + */ + mp->m_logbufs = -1; + mp->m_logbsize = -1; + + if (!options) + goto done; + + while ((this_char = strsep(&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr(this_char, '=')) != NULL) + *value++ = 0; + + if (!strcmp(this_char, MNTOPT_LOGBUFS)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + if (kstrtoint(value, 10, &mp->m_logbufs)) + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_logname) + return ENOMEM; + } else if (!strcmp(this_char, MNTOPT_MTPT)) { + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_RTDEV)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_rtname) + return ENOMEM; + } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + if (kstrtoint(value, 10, &iosize)) + return EINVAL; + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + if (suffix_kstrtoint(value, 10, &iosize)) + return EINVAL; + iosizelog = ffs(iosize) - 1; + } else if (!strcmp(this_char, MNTOPT_GRPID) || + !strcmp(this_char, MNTOPT_BSDGROUPS)) { + mp->m_flags |= XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_NOGRPID) || + !strcmp(this_char, MNTOPT_SYSVGROUPS)) { + mp->m_flags &= ~XFS_MOUNT_GRPID; + } else if (!strcmp(this_char, MNTOPT_WSYNC)) { + mp->m_flags |= XFS_MOUNT_WSYNC; + } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { + mp->m_flags |= XFS_MOUNT_NORECOVERY; + } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { + mp->m_flags |= XFS_MOUNT_NOALIGN; + } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { + mp->m_flags |= XFS_MOUNT_SWALLOC; + } else if (!strcmp(this_char, MNTOPT_SUNIT)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + if (kstrtoint(value, 10, &dsunit)) + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { + if (!value || !*value) { + xfs_warn(mp, "%s option requires an argument", + this_char); + return EINVAL; + } + if (kstrtoint(value, 10, &dswidth)) + return EINVAL; + } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { + mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { + mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; +#if !XFS_BIG_INUMS + xfs_warn(mp, "%s option not allowed on this system", + this_char); + return EINVAL; +#endif + } else if (!strcmp(this_char, MNTOPT_NOUUID)) { + mp->m_flags |= XFS_MOUNT_NOUUID; + } else if (!strcmp(this_char, MNTOPT_BARRIER)) { + mp->m_flags |= XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + mp->m_flags |= XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { + mp->m_flags &= ~XFS_MOUNT_IKEEP; + } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { + mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { + mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; + } else if (!strcmp(this_char, MNTOPT_ATTR2)) { + mp->m_flags |= XFS_MOUNT_ATTR2; + } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) { + mp->m_flags |= XFS_MOUNT_FILESTREAMS; + } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) { + mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; + mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; + mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; + } else if (!strcmp(this_char, MNTOPT_QUOTA) || + !strcmp(this_char, MNTOPT_UQUOTA) || + !strcmp(this_char, MNTOPT_USRQUOTA)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | + XFS_UQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) || + !strcmp(this_char, MNTOPT_UQUOTANOENF)) { + mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_UQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_PQUOTA) || + !strcmp(this_char, MNTOPT_PRJQUOTA)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | + XFS_PQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) { + mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_PQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_GQUOTA) || + !strcmp(this_char, MNTOPT_GRPQUOTA)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | + XFS_GQUOTA_ENFD); + } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { + mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + mp->m_qflags &= ~XFS_GQUOTA_ENFD; + } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { + xfs_warn(mp, + "delaylog is the default now, option is deprecated."); + } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { + xfs_warn(mp, + "nodelaylog support has been removed, option is deprecated."); + } else if (!strcmp(this_char, MNTOPT_DISCARD)) { + mp->m_flags |= XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { + mp->m_flags &= ~XFS_MOUNT_DISCARD; + } else if (!strcmp(this_char, "ihashsize")) { + xfs_warn(mp, + "ihashsize no longer used, option is deprecated."); + } else if (!strcmp(this_char, "osyncisdsync")) { + xfs_warn(mp, + "osyncisdsync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "osyncisosync")) { + xfs_warn(mp, + "osyncisosync has no effect, option is deprecated."); + } else if (!strcmp(this_char, "irixsgid")) { + xfs_warn(mp, + "irixsgid is now a sysctl(2) variable, option is deprecated."); + } else { + xfs_warn(mp, "unknown mount option [%s].", this_char); + return EINVAL; + } + } + + /* + * no recovery flag requires a read-only mount + */ + if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && + !(mp->m_flags & XFS_MOUNT_RDONLY)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return EINVAL; + } + + if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); + return EINVAL; + } + +#ifndef CONFIG_XFS_QUOTA + if (XFS_IS_QUOTA_RUNNING(mp)) { + xfs_warn(mp, "quota support not available in this kernel."); + return EINVAL; + } +#endif + + if ((dsunit && !dswidth) || (!dsunit && dswidth)) { + xfs_warn(mp, "sunit and swidth must be specified together"); + return EINVAL; + } + + if (dsunit && (dswidth % dsunit != 0)) { + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", + dswidth, dsunit); + return EINVAL; + } + +done: + if (dsunit && !(mp->m_flags & XFS_MOUNT_NOALIGN)) { + /* + * At this point the superblock has not been read + * in, therefore we do not know the block size. + * Before the mount call ends we will convert + * these to FSBs. + */ + mp->m_dalign = dsunit; + mp->m_swidth = dswidth; + } + + if (mp->m_logbufs != -1 && + mp->m_logbufs != 0 && + (mp->m_logbufs < XLOG_MIN_ICLOGS || + mp->m_logbufs > XLOG_MAX_ICLOGS)) { + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", + mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); + return XFS_ERROR(EINVAL); + } + if (mp->m_logbsize != -1 && + mp->m_logbsize != 0 && + (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || + mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || + !is_power_of_2(mp->m_logbsize))) { + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + mp->m_logbsize); + return XFS_ERROR(EINVAL); + } + + if (iosizelog) { + if (iosizelog > XFS_MAX_IO_LOG || + iosizelog < XFS_MIN_IO_LOG) { + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", + iosizelog, XFS_MIN_IO_LOG, + XFS_MAX_IO_LOG); + return XFS_ERROR(EINVAL); + } + + mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; + mp->m_readio_log = iosizelog; + mp->m_writeio_log = iosizelog; + } + + return 0; +} + +struct proc_xfs_info { + int flag; + char *str; +}; + +STATIC int +xfs_showargs( + struct xfs_mount *mp, + struct seq_file *m) +{ + static struct proc_xfs_info xfs_info_set[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_IKEEP, "," MNTOPT_IKEEP }, + { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, + { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, + { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, + { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, + { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, + { XFS_MOUNT_ATTR2, "," MNTOPT_ATTR2 }, + { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, + { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, + { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { 0, NULL } + }; + static struct proc_xfs_info xfs_info_unset[] = { + /* the few simple ones we can get from the mount struct */ + { XFS_MOUNT_COMPAT_IOSIZE, "," MNTOPT_LARGEIO }, + { XFS_MOUNT_BARRIER, "," MNTOPT_NOBARRIER }, + { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_64BITINODE }, + { 0, NULL } + }; + struct proc_xfs_info *xfs_infop; + + for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { + if (mp->m_flags & xfs_infop->flag) + seq_puts(m, xfs_infop->str); + } + for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) { + if (!(mp->m_flags & xfs_infop->flag)) + seq_puts(m, xfs_infop->str); + } + + if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) + seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", + (int)(1 << mp->m_writeio_log) >> 10); + + if (mp->m_logbufs > 0) + seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); + if (mp->m_logbsize > 0) + seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); + + if (mp->m_logname) + seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); + if (mp->m_rtname) + seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); + + if (mp->m_dalign > 0) + seq_printf(m, "," MNTOPT_SUNIT "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); + if (mp->m_swidth > 0) + seq_printf(m, "," MNTOPT_SWIDTH "=%d", + (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); + + if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD)) + seq_puts(m, "," MNTOPT_USRQUOTA); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, "," MNTOPT_UQUOTANOENF); + + if (mp->m_qflags & XFS_PQUOTA_ACCT) { + if (mp->m_qflags & XFS_PQUOTA_ENFD) + seq_puts(m, "," MNTOPT_PRJQUOTA); + else + seq_puts(m, "," MNTOPT_PQUOTANOENF); + } + if (mp->m_qflags & XFS_GQUOTA_ACCT) { + if (mp->m_qflags & XFS_GQUOTA_ENFD) + seq_puts(m, "," MNTOPT_GRPQUOTA); + else + seq_puts(m, "," MNTOPT_GQUOTANOENF); + } + + if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) + seq_puts(m, "," MNTOPT_NOQUOTA); + + return 0; +} +__uint64_t +xfs_max_file_offset( + unsigned int blockshift) +{ + unsigned int pagefactor = 1; + unsigned int bitshift = BITS_PER_LONG - 1; + + /* Figure out maximum filesize, on Linux this can depend on + * the filesystem blocksize (on 32 bit platforms). + * __block_write_begin does this in an [unsigned] long... + * page->index << (PAGE_CACHE_SHIFT - bbits) + * So, for page sized blocks (4K on 32 bit platforms), + * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is + * (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) + * but for smaller blocksizes it is less (bbits = log2 bsize). + * Note1: get_block_t takes a long (implicit cast from above) + * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch + * can optionally convert the [unsigned] long from above into + * an [unsigned] long long. + */ + +#if BITS_PER_LONG == 32 +# if defined(CONFIG_LBDAF) + ASSERT(sizeof(sector_t) == 8); + pagefactor = PAGE_CACHE_SIZE; + bitshift = BITS_PER_LONG; +# else + pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift); +# endif +#endif + + return (((__uint64_t)pagefactor) << bitshift) - 1; +} + +xfs_agnumber_t +xfs_set_inode32(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + xfs_agnumber_t maxagi = 0; + xfs_sb_t *sbp = &mp->m_sb; + xfs_agnumber_t max_metadata; + xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); + xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); + xfs_perag_t *pag; + + /* Calculate how much should be reserved for inodes to meet + * the max inode percentage. + */ + if (mp->m_maxicount) { + __uint64_t icount; + + icount = sbp->sb_dblocks * sbp->sb_imax_pct; + do_div(icount, 100); + icount += sbp->sb_agblocks - 1; + do_div(icount, sbp->sb_agblocks); + max_metadata = icount; + } else { + max_metadata = sbp->sb_agcount; + } + + for (index = 0; index < sbp->sb_agcount; index++) { + ino = XFS_AGINO_TO_INO(mp, index, agino); + + if (ino > XFS_MAXINUMBER_32) { + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 0; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + continue; + } + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + maxagi++; + if (index < max_metadata) + pag->pagf_metadata = 1; + xfs_perag_put(pag); + } + mp->m_flags |= (XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + + return maxagi; +} + +xfs_agnumber_t +xfs_set_inode64(struct xfs_mount *mp) +{ + xfs_agnumber_t index = 0; + + for (index = 0; index < mp->m_sb.sb_agcount; index++) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, index); + pag->pagi_inodeok = 1; + pag->pagf_metadata = 0; + xfs_perag_put(pag); + } + + /* There is no need for lock protection on m_flags, + * the rw_semaphore of the VFS superblock is locked + * during mount/umount/remount operations, so this is + * enough to avoid concurency on the m_flags field + */ + mp->m_flags &= ~(XFS_MOUNT_32BITINODES | + XFS_MOUNT_SMALL_INUMS); + return index; +} + +STATIC int +xfs_blkdev_get( + xfs_mount_t *mp, + const char *name, + struct block_device **bdevp) +{ + int error = 0; + + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); + if (IS_ERR(*bdevp)) { + error = PTR_ERR(*bdevp); + xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); + } + + return -error; +} + +STATIC void +xfs_blkdev_put( + struct block_device *bdev) +{ + if (bdev) + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +} + +void +xfs_blkdev_issue_flush( + xfs_buftarg_t *buftarg) +{ + blkdev_issue_flush(buftarg->bt_bdev, GFP_NOFS, NULL); +} + +STATIC void +xfs_close_devices( + struct xfs_mount *mp) +{ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); + xfs_blkdev_put(logdev); + } + if (mp->m_rtdev_targp) { + struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); + xfs_blkdev_put(rtdev); + } + xfs_free_buftarg(mp, mp->m_ddev_targp); +} + +/* + * The file system configurations are: + * (1) device (partition) with data and internal log + * (2) logical volume with data and log subvolumes. + * (3) logical volume with data, log, and realtime subvolumes. + * + * We only have to handle opening the log and realtime volumes here if + * they are present. The data subvolume has already been opened by + * get_sb_bdev() and is stored in sb->s_bdev. + */ +STATIC int +xfs_open_devices( + struct xfs_mount *mp) +{ + struct block_device *ddev = mp->m_super->s_bdev; + struct block_device *logdev = NULL, *rtdev = NULL; + int error; + + /* + * Open real time and log devices - order is important. + */ + if (mp->m_logname) { + error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + if (error) + goto out; + } + + if (mp->m_rtname) { + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + if (error) + goto out_close_logdev; + + if (rtdev == ddev || rtdev == logdev) { + xfs_warn(mp, + "Cannot mount filesystem with identical rtdev and ddev/logdev."); + error = EINVAL; + goto out_close_rtdev; + } + } + + /* + * Setup xfs_mount buffer target pointers + */ + error = ENOMEM; + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + if (!mp->m_ddev_targp) + goto out_close_rtdev; + + if (rtdev) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + if (!mp->m_rtdev_targp) + goto out_free_ddev_targ; + } + + if (logdev && logdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + if (!mp->m_logdev_targp) + goto out_free_rtdev_targ; + } else { + mp->m_logdev_targp = mp->m_ddev_targp; + } + + return 0; + + out_free_rtdev_targ: + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp, mp->m_rtdev_targp); + out_free_ddev_targ: + xfs_free_buftarg(mp, mp->m_ddev_targp); + out_close_rtdev: + if (rtdev) + xfs_blkdev_put(rtdev); + out_close_logdev: + if (logdev && logdev != ddev) + xfs_blkdev_put(logdev); + out: + return error; +} + +/* + * Setup xfs_mount buffer target pointers based on superblock + */ +STATIC int +xfs_setup_devices( + struct xfs_mount *mp) +{ + int error; + + error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + if (error) + return error; + + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { + unsigned int log_sector_size = BBSIZE; + + if (xfs_sb_version_hassector(&mp->m_sb)) + log_sector_size = mp->m_sb.sb_logsectsize; + error = xfs_setsize_buftarg(mp->m_logdev_targp, + log_sector_size); + if (error) + return error; + } + if (mp->m_rtdev_targp) { + error = xfs_setsize_buftarg(mp->m_rtdev_targp, + mp->m_sb.sb_sectsize); + if (error) + return error; + } + + return 0; +} + +STATIC int +xfs_init_mount_workqueues( + struct xfs_mount *mp) +{ + mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_data_workqueue) + goto out; + + mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_unwritten_workqueue) + goto out_destroy_data_iodone_queue; + + mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_cil_workqueue) + goto out_destroy_unwritten; + + mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", + 0, 0, mp->m_fsname); + if (!mp->m_reclaim_workqueue) + goto out_destroy_cil; + + mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", + 0, 0, mp->m_fsname); + if (!mp->m_log_workqueue) + goto out_destroy_reclaim; + + mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", + 0, 0, mp->m_fsname); + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + + return 0; + +out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); +out_destroy_reclaim: + destroy_workqueue(mp->m_reclaim_workqueue); +out_destroy_cil: + destroy_workqueue(mp->m_cil_workqueue); +out_destroy_unwritten: + destroy_workqueue(mp->m_unwritten_workqueue); +out_destroy_data_iodone_queue: + destroy_workqueue(mp->m_data_workqueue); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_mount_workqueues( + struct xfs_mount *mp) +{ + destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); + destroy_workqueue(mp->m_cil_workqueue); + destroy_workqueue(mp->m_data_workqueue); + destroy_workqueue(mp->m_unwritten_workqueue); +} + +/* + * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK + * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting + * for IO to complete so that we effectively throttle multiple callers to the + * rate at which IO is completing. + */ +void +xfs_flush_inodes( + struct xfs_mount *mp) +{ + struct super_block *sb = mp->m_super; + + if (down_read_trylock(&sb->s_umount)) { + sync_inodes_sb(sb); + up_read(&sb->s_umount); + } +} + +/* Catch misguided souls that try to use this interface on XFS */ +STATIC struct inode * +xfs_fs_alloc_inode( + struct super_block *sb) +{ + BUG(); + return NULL; +} + +/* + * Now that the generic code is guaranteed not to be accessing + * the linux inode, we can reclaim the inode. + */ +STATIC void +xfs_fs_destroy_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + trace_xfs_destroy_inode(ip); + + XFS_STATS_INC(vn_reclaim); + + ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); + + /* + * We should never get here with one of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); + + /* + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. + */ + xfs_inode_set_reclaim_tag(ip); +} + +/* + * Slab object creation initialisation for the XFS inode. + * This covers only the idempotent fields in the XFS inode; + * all other fields need to be initialised on allocation + * from the slab. This avoids the need to repeatedly initialise + * fields in the xfs inode that left in the initialise state + * when freeing the inode. + */ +STATIC void +xfs_fs_inode_init_once( + void *inode) +{ + struct xfs_inode *ip = inode; + + memset(ip, 0, sizeof(struct xfs_inode)); + + /* vfs inode */ + inode_init_once(VFS_I(ip)); + + /* xfs inode */ + atomic_set(&ip->i_pincount, 0); + spin_lock_init(&ip->i_flags_lock); + + mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); +} + +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + xfs_inode_t *ip = XFS_I(inode); + + ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + + trace_xfs_evict_inode(ip); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); + XFS_STATS_INC(vn_rele); + XFS_STATS_INC(vn_remove); + XFS_STATS_DEC(vn_active); + + xfs_inactive(ip); +} + +/* + * We do an unlocked check for XFS_IDONTCACHE here because we are already + * serialised against cache hits here via the inode->i_lock and igrab() in + * xfs_iget_cache_hit(). Hence a lookup that might clear this flag will not be + * racing with us, and it avoids needing to grab a spinlock here for every inode + * we drop the final reference on. + */ +STATIC int +xfs_fs_drop_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + + return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE); +} + +STATIC void +xfs_free_fsname( + struct xfs_mount *mp) +{ + kfree(mp->m_fsname); + kfree(mp->m_rtname); + kfree(mp->m_logname); +} + +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_icsb_destroy_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + +STATIC int +xfs_fs_sync_fs( + struct super_block *sb, + int wait) +{ + struct xfs_mount *mp = XFS_M(sb); + + /* + * Doing anything during the async pass would be counterproductive. + */ + if (!wait) + return 0; + + xfs_log_force(mp, XFS_LOG_SYNC); + if (laptop_mode) { + /* + * The disk must be active because we're syncing. + * We schedule log work now (now that the disk is + * active) instead of later (when it might not be). + */ + flush_delayed_work(&mp->m_log->l_work); + } + + return 0; +} + +STATIC int +xfs_fs_statfs( + struct dentry *dentry, + struct kstatfs *statp) +{ + struct xfs_mount *mp = XFS_M(dentry->d_sb); + xfs_sb_t *sbp = &mp->m_sb; + struct xfs_inode *ip = XFS_I(dentry->d_inode); + __uint64_t fakeinos, id; + xfs_extlen_t lsize; + __int64_t ffree; + + statp->f_type = XFS_SB_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + id = huge_encode_dev(mp->m_ddev_targp->bt_dev); + statp->f_fsid.val[0] = (u32)id; + statp->f_fsid.val[1] = (u32)(id >> 32); + + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + + spin_lock(&mp->m_sb_lock); + statp->f_bsize = sbp->sb_blocksize; + lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; + statp->f_blocks = sbp->sb_dblocks - lsize; + statp->f_bfree = statp->f_bavail = + sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + fakeinos = statp->f_bfree << sbp->sb_inopblog; + statp->f_files = + MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + if (mp->m_maxicount) + statp->f_files = min_t(typeof(statp->f_files), + statp->f_files, + mp->m_maxicount); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + + spin_unlock(&mp->m_sb_lock); + + if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == + (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD)) + xfs_qm_statvfs(ip, statp); + return 0; +} + +STATIC void +xfs_save_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks = 0; + + mp->m_resblks_save = mp->m_resblks; + xfs_reserve_blocks(mp, &resblks, NULL); +} + +STATIC void +xfs_restore_resvblks(struct xfs_mount *mp) +{ + __uint64_t resblks; + + if (mp->m_resblks_save) { + resblks = mp->m_resblks_save; + mp->m_resblks_save = 0; + } else + resblks = xfs_default_resblks(mp); + + xfs_reserve_blocks(mp, &resblks, NULL); +} + +/* + * Trigger writeback of all the dirty metadata in the file system. + * + * This ensures that the metadata is written to their location on disk rather + * than just existing in transactions in the log. This means after a quiesce + * there is no log replay required to write the inodes to disk - this is the + * primary difference between a sync and a quiesce. + * + * Note: xfs_log_quiesce() stops background log work - the callers must ensure + * it is started again when appropriate. + */ +static void +xfs_quiesce_attr( + struct xfs_mount *mp) +{ + int error = 0; + + /* wait for all modifications to complete */ + while (atomic_read(&mp->m_active_trans) > 0) + delay(100); + + /* force the log to unpin objects from the now complete transactions */ + xfs_log_force(mp, XFS_LOG_SYNC); + + /* reclaim inodes to do any IO before the freeze completes */ + xfs_reclaim_inodes(mp, 0); + xfs_reclaim_inodes(mp, SYNC_WAIT); + + /* Push the superblock and write an unmount record */ + error = xfs_log_sbcount(mp); + if (error) + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " + "Frozen image may not be consistent."); + /* + * Just warn here till VFS can correctly support + * read-only remount without racing. + */ + WARN_ON(atomic_read(&mp->m_active_trans) != 0); + + xfs_log_quiesce(mp); +} + +STATIC int +xfs_fs_remount( + struct super_block *sb, + int *flags, + char *options) +{ + struct xfs_mount *mp = XFS_M(sb); + substring_t args[MAX_OPT_ARGS]; + char *p; + int error; + + sync_filesystem(sb); + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_barrier: + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; + case Opt_inode64: + mp->m_maxagi = xfs_set_inode64(mp); + break; + case Opt_inode32: + mp->m_maxagi = xfs_set_inode32(mp); + break; + default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 + xfs_info(mp, + "mount option \"%s\" not supported for remount", p); + return -EINVAL; +#else + break; +#endif + } + } + + /* ro -> rw */ + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + mp->m_flags &= ~XFS_MOUNT_RDONLY; + + /* + * If this is the first remount to writeable state we + * might have some superblock changes to update. + */ + if (mp->m_update_flags) { + error = xfs_mount_log_sb(mp, mp->m_update_flags); + if (error) { + xfs_warn(mp, "failed to write sb changes"); + return error; + } + mp->m_update_flags = 0; + } + + /* + * Fill out the reserve pool if it is empty. Use the stashed + * value if it is non-zero, otherwise go with the default. + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + } + + /* rw -> ro */ + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + /* + * Before we sync the metadata, we need to free up the reserve + * block pool so that the used block count in the superblock on + * disk is correct at the end of the remount. Stash the current + * reserve pool size so that if we get remounted rw, we can + * return it to the same size. + */ + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + mp->m_flags |= XFS_MOUNT_RDONLY; + } + + return 0; +} + +/* + * Second stage of a freeze. The data is already frozen so we only + * need to take care of the metadata. Once that's done write a dummy + * record to dirty the log in case of a crash while frozen. + */ +STATIC int +xfs_fs_freeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_save_resvblks(mp); + xfs_quiesce_attr(mp); + return -xfs_fs_log_dummy(mp); +} + +STATIC int +xfs_fs_unfreeze( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); + return 0; +} + +STATIC int +xfs_fs_show_options( + struct seq_file *m, + struct dentry *root) +{ + return -xfs_showargs(XFS_M(root->d_sb), m); +} + +/* + * This function fills in xfs_mount_t fields based on mount args. + * Note: the superblock _has_ now been read in. + */ +STATIC int +xfs_finish_flags( + struct xfs_mount *mp) +{ + int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); + + /* Fail a mount where the logbuf is smaller than the log stripe */ + if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (mp->m_logbsize <= 0 && + mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { + mp->m_logbsize = mp->m_sb.sb_logsunit; + } else if (mp->m_logbsize > 0 && + mp->m_logbsize < mp->m_sb.sb_logsunit) { + xfs_warn(mp, + "logbuf size must be greater than or equal to log stripe size"); + return XFS_ERROR(EINVAL); + } + } else { + /* Fail a mount if the logbuf is larger than 32K */ + if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { + xfs_warn(mp, + "logbuf size for version 1 logs must be 16K or 32K"); + return XFS_ERROR(EINVAL); + } + } + + /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + + /* + * mkfs'ed attr2 will turn on attr2 mount unless explicitly + * told by noattr2 to turn it off + */ + if (xfs_sb_version_hasattr2(&mp->m_sb) && + !(mp->m_flags & XFS_MOUNT_NOATTR2)) + mp->m_flags |= XFS_MOUNT_ATTR2; + + /* + * prohibit r/w mounts of read-only filesystems + */ + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + xfs_warn(mp, + "cannot mount a read-only filesystem as read-write"); + return XFS_ERROR(EROFS); + } + + if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && + (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && + !xfs_sb_version_has_pquotino(&mp->m_sb)) { + xfs_warn(mp, + "Super block does not support project and group quota together"); + return XFS_ERROR(EINVAL); + } + + return 0; +} + +STATIC int +xfs_fs_fill_super( + struct super_block *sb, + void *data, + int silent) +{ + struct inode *root; + struct xfs_mount *mp = NULL; + int flags = 0, error = ENOMEM; + + mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); + if (!mp) + goto out; + + spin_lock_init(&mp->m_sb_lock); + mutex_init(&mp->m_growlock); + atomic_set(&mp->m_active_trans, 0); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + + mp->m_super = sb; + sb->s_fs_info = mp; + + error = xfs_parseargs(mp, (char *)data); + if (error) + goto out_free_fsname; + + sb_min_blocksize(sb, BBSIZE); + sb->s_xattr = xfs_xattr_handlers; + sb->s_export_op = &xfs_export_operations; +#ifdef CONFIG_XFS_QUOTA + sb->s_qcop = &xfs_quotactl_operations; +#endif + sb->s_op = &xfs_super_operations; + + if (silent) + flags |= XFS_MFSI_QUIET; + + error = xfs_open_devices(mp); + if (error) + goto out_free_fsname; + + error = -xfs_init_mount_workqueues(mp); + if (error) + goto out_close_devices; + + error = -xfs_icsb_init_counters(mp); + if (error) + goto out_destroy_workqueues; + + error = xfs_readsb(mp, flags); + if (error) + goto out_destroy_counters; + + error = xfs_finish_flags(mp); + if (error) + goto out_free_sb; + + error = xfs_setup_devices(mp); + if (error) + goto out_free_sb; + + error = xfs_filestream_mount(mp); + if (error) + goto out_free_sb; + + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. + */ + sb->s_magic = XFS_SB_MAGIC; + sb->s_blocksize = mp->m_sb.sb_blocksize; + sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; + sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits); + sb->s_max_links = XFS_MAXLINK; + sb->s_time_gran = 1; + set_posix_acl_flag(sb); + + /* version 5 superblocks support inode version counters. */ + if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + sb->s_flags |= MS_I_VERSION; + + error = xfs_mountfs(mp); + if (error) + goto out_filestream_unmount; + + root = igrab(VFS_I(mp->m_rootip)); + if (!root) { + error = ENOENT; + goto out_unmount; + } + sb->s_root = d_make_root(root); + if (!sb->s_root) { + error = ENOMEM; + goto out_unmount; + } + + return 0; + + out_filestream_unmount: + xfs_filestream_unmount(mp); + out_free_sb: + xfs_freesb(mp); + out_destroy_counters: + xfs_icsb_destroy_counters(mp); +out_destroy_workqueues: + xfs_destroy_mount_workqueues(mp); + out_close_devices: + xfs_close_devices(mp); + out_free_fsname: + xfs_free_fsname(mp); + kfree(mp); + out: + return -error; + + out_unmount: + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + goto out_free_sb; +} + +STATIC struct dentry * +xfs_fs_mount( + struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); +} + +static long +xfs_fs_nr_cached_objects( + struct super_block *sb, + int nid) +{ + return xfs_reclaim_inodes_count(XFS_M(sb)); +} + +static long +xfs_fs_free_cached_objects( + struct super_block *sb, + long nr_to_scan, + int nid) +{ + return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); +} + +static const struct super_operations xfs_super_operations = { + .alloc_inode = xfs_fs_alloc_inode, + .destroy_inode = xfs_fs_destroy_inode, + .evict_inode = xfs_fs_evict_inode, + .drop_inode = xfs_fs_drop_inode, + .put_super = xfs_fs_put_super, + .sync_fs = xfs_fs_sync_fs, + .freeze_fs = xfs_fs_freeze, + .unfreeze_fs = xfs_fs_unfreeze, + .statfs = xfs_fs_statfs, + .remount_fs = xfs_fs_remount, + .show_options = xfs_fs_show_options, + .nr_cached_objects = xfs_fs_nr_cached_objects, + .free_cached_objects = xfs_fs_free_cached_objects, +}; + +static struct file_system_type xfs_fs_type = { + .owner = THIS_MODULE, + .name = "xfs", + .mount = xfs_fs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("xfs"); + +STATIC int __init +xfs_init_zones(void) +{ + + xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend"); + if (!xfs_ioend_zone) + goto out; + + xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE, + xfs_ioend_zone); + if (!xfs_ioend_pool) + goto out_destroy_ioend_zone; + + xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t), + "xfs_log_ticket"); + if (!xfs_log_ticket_zone) + goto out_destroy_ioend_pool; + + xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), + "xfs_bmap_free_item"); + if (!xfs_bmap_free_item_zone) + goto out_destroy_log_ticket_zone; + + xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), + "xfs_btree_cur"); + if (!xfs_btree_cur_zone) + goto out_destroy_bmap_free_item_zone; + + xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t), + "xfs_da_state"); + if (!xfs_da_state_zone) + goto out_destroy_btree_cur_zone; + + xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); + if (!xfs_ifork_zone) + goto out_destroy_da_state_zone; + + xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); + if (!xfs_trans_zone) + goto out_destroy_ifork_zone; + + xfs_log_item_desc_zone = + kmem_zone_init(sizeof(struct xfs_log_item_desc), + "xfs_log_item_desc"); + if (!xfs_log_item_desc_zone) + goto out_destroy_trans_zone; + + /* + * The size of the zone allocated buf log item is the maximum + * size possible under XFS. This wastes a little bit of memory, + * but it is much faster. + */ + xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), + "xfs_buf_item"); + if (!xfs_buf_item_zone) + goto out_destroy_log_item_desc_zone; + + xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + + xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) + goto out_destroy_efd_zone; + + xfs_inode_zone = + kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, + xfs_fs_inode_init_once); + if (!xfs_inode_zone) + goto out_destroy_efi_zone; + + xfs_ili_zone = + kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", + KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), + "xfs_icr"); + if (!xfs_icreate_zone) + goto out_destroy_ili_zone; + + return 0; + + out_destroy_ili_zone: + kmem_zone_destroy(xfs_ili_zone); + out_destroy_inode_zone: + kmem_zone_destroy(xfs_inode_zone); + out_destroy_efi_zone: + kmem_zone_destroy(xfs_efi_zone); + out_destroy_efd_zone: + kmem_zone_destroy(xfs_efd_zone); + out_destroy_buf_item_zone: + kmem_zone_destroy(xfs_buf_item_zone); + out_destroy_log_item_desc_zone: + kmem_zone_destroy(xfs_log_item_desc_zone); + out_destroy_trans_zone: + kmem_zone_destroy(xfs_trans_zone); + out_destroy_ifork_zone: + kmem_zone_destroy(xfs_ifork_zone); + out_destroy_da_state_zone: + kmem_zone_destroy(xfs_da_state_zone); + out_destroy_btree_cur_zone: + kmem_zone_destroy(xfs_btree_cur_zone); + out_destroy_bmap_free_item_zone: + kmem_zone_destroy(xfs_bmap_free_item_zone); + out_destroy_log_ticket_zone: + kmem_zone_destroy(xfs_log_ticket_zone); + out_destroy_ioend_pool: + mempool_destroy(xfs_ioend_pool); + out_destroy_ioend_zone: + kmem_zone_destroy(xfs_ioend_zone); + out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_zones(void) +{ + /* + * Make sure all delayed rcu free are flushed before we + * destroy caches. + */ + rcu_barrier(); + kmem_zone_destroy(xfs_icreate_zone); + kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_inode_zone); + kmem_zone_destroy(xfs_efi_zone); + kmem_zone_destroy(xfs_efd_zone); + kmem_zone_destroy(xfs_buf_item_zone); + kmem_zone_destroy(xfs_log_item_desc_zone); + kmem_zone_destroy(xfs_trans_zone); + kmem_zone_destroy(xfs_ifork_zone); + kmem_zone_destroy(xfs_da_state_zone); + kmem_zone_destroy(xfs_btree_cur_zone); + kmem_zone_destroy(xfs_bmap_free_item_zone); + kmem_zone_destroy(xfs_log_ticket_zone); + mempool_destroy(xfs_ioend_pool); + kmem_zone_destroy(xfs_ioend_zone); + +} + +STATIC int __init +xfs_init_workqueues(void) +{ + /* + * The allocation workqueue can be used in memory reclaim situations + * (writepage path), and parallelism is only limited by the number of + * AGs in all the filesystems mounted. Hence use the default large + * max_active value for this workqueue. + */ + xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); + if (!xfs_alloc_wq) + return -ENOMEM; + + return 0; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_alloc_wq); +} + +STATIC int __init +init_xfs_fs(void) +{ + int error; + + printk(KERN_INFO XFS_VERSION_STRING " with " + XFS_BUILD_OPTIONS " enabled\n"); + + xfs_dir_startup(); + + error = xfs_init_zones(); + if (error) + goto out; + + error = xfs_init_workqueues(); + if (error) + goto out_destroy_zones; + + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + + error = xfs_buf_init(); + if (error) + goto out_mru_cache_uninit; + + error = xfs_init_procfs(); + if (error) + goto out_buf_terminate; + + error = xfs_sysctl_register(); + if (error) + goto out_cleanup_procfs; + + error = xfs_qm_init(); + if (error) + goto out_sysctl_unregister; + + error = register_filesystem(&xfs_fs_type); + if (error) + goto out_qm_exit; + return 0; + + out_qm_exit: + xfs_qm_exit(); + out_sysctl_unregister: + xfs_sysctl_unregister(); + out_cleanup_procfs: + xfs_cleanup_procfs(); + out_buf_terminate: + xfs_buf_terminate(); + out_mru_cache_uninit: + xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); + out_destroy_zones: + xfs_destroy_zones(); + out: + return error; +} + +STATIC void __exit +exit_xfs_fs(void) +{ + xfs_qm_exit(); + unregister_filesystem(&xfs_fs_type); + xfs_sysctl_unregister(); + xfs_cleanup_procfs(); + xfs_buf_terminate(); + xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); + xfs_destroy_zones(); +} + +module_init(init_xfs_fs); +module_exit(exit_xfs_fs); + +MODULE_AUTHOR("Silicon Graphics, Inc."); +MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled"); +MODULE_LICENSE("GPL"); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/xfs_super.h index 33dd1ca1324..bbe3d15a790 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -18,26 +18,14 @@ #ifndef __XFS_SUPER_H__ #define __XFS_SUPER_H__ -#ifdef CONFIG_XFS_DMAPI -# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops) -# define vfs_initdmapi() dmapi_init() -# define vfs_exitdmapi() dmapi_uninit() -#else -# define vfs_insertdmapi(vfs) do { } while (0) -# define vfs_initdmapi() do { } while (0) -# define vfs_exitdmapi() do { } while (0) -#endif +#include <linux/exportfs.h> #ifdef CONFIG_XFS_QUOTA -# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops) -extern void xfs_qm_init(void); +extern int xfs_qm_init(void); extern void xfs_qm_exit(void); -# define vfs_initquota() xfs_qm_init() -# define vfs_exitquota() xfs_qm_exit() #else -# define vfs_insertquota(vfs) do { } while (0) -# define vfs_initquota() do { } while (0) -# define vfs_exitquota() do { } while (0) +# define xfs_qm_init() (0) +# define xfs_qm_exit() do { } while (0) #endif #ifdef CONFIG_XFS_POSIX_ACL @@ -48,13 +36,7 @@ extern void xfs_qm_exit(void); # define set_posix_acl_flag(sb) do { } while (0) #endif -#ifdef CONFIG_XFS_SECURITY -# define XFS_SECURITY_STRING "security attributes, " -# define ENOSECURITY 0 -#else -# define XFS_SECURITY_STRING -# define ENOSECURITY EOPNOTSUPP -#endif +#define XFS_SECURITY_STRING "security attributes, " #ifdef CONFIG_XFS_RT # define XFS_REALTIME_STRING "realtime, " @@ -72,30 +54,17 @@ extern void xfs_qm_exit(void); # define XFS_BIGFS_STRING #endif -#ifdef CONFIG_XFS_TRACE -# define XFS_TRACE_STRING "tracing, " -#else -# define XFS_TRACE_STRING -#endif - -#ifdef CONFIG_XFS_DMAPI -# define XFS_DMAPI_STRING "dmapi support, " -#else -# define XFS_DMAPI_STRING -#endif - #ifdef DEBUG # define XFS_DBG_STRING "debug" #else # define XFS_DBG_STRING "no debug" #endif +#define XFS_VERSION_STRING "SGI XFS" #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ XFS_BIGFS_STRING \ - XFS_TRACE_STRING \ - XFS_DMAPI_STRING \ XFS_DBG_STRING /* DBG must be last */ struct xfs_inode; @@ -105,16 +74,15 @@ struct block_device; extern __uint64_t xfs_max_file_offset(unsigned int); -extern void xfs_initialize_vnode(bhv_desc_t *, bhv_vnode_t *, bhv_desc_t *, int); - -extern void xfs_flush_inode(struct xfs_inode *); -extern void xfs_flush_device(struct xfs_inode *); - -extern int xfs_blkdev_get(struct xfs_mount *, const char *, - struct block_device **); -extern void xfs_blkdev_put(struct block_device *); +extern void xfs_flush_inodes(struct xfs_mount *mp); extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); +extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); +extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); + +extern const struct export_operations xfs_export_operations; +extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct quotactl_ops xfs_quotactl_operations; -extern struct export_operations xfs_export_operations; +#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c new file mode 100644 index 00000000000..d69363c833e --- /dev/null +++ b/fs/xfs/xfs_symlink.c @@ -0,0 +1,599 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_bit.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_dir2.h" +#include "xfs_inode.h" +#include "xfs_ialloc.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" +#include "xfs_error.h" +#include "xfs_quota.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_dinode.h" + +/* ----- Kernel only functions below ----- */ +STATIC int +xfs_readlink_bmap( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + struct xfs_buf *bp; + xfs_daddr_t d; + char *cur_chunk; + int pathlen = ip->i_d.di_size; + int nmaps = XFS_SYMLINK_MAPS; + int byte_cnt; + int n; + int error = 0; + int fsblocks = 0; + int offset; + + fsblocks = xfs_symlink_blocks(mp, pathlen); + error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0); + if (error) + goto out; + + offset = 0; + for (n = 0; n < nmaps; n++) { + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + + bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, + &xfs_symlink_buf_ops); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; + goto out; + } + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + if (pathlen < byte_cnt) + byte_cnt = pathlen; + + cur_chunk = bp->b_addr; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_symlink_hdr_ok(ip->i_ino, offset, + byte_cnt, bp)) { + error = EFSCORRUPTED; + xfs_alert(mp, +"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", + offset, byte_cnt, ip->i_ino); + xfs_buf_relse(bp); + goto out; + + } + + cur_chunk += sizeof(struct xfs_dsymlink_hdr); + } + + memcpy(link + offset, bp->b_addr, byte_cnt); + + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_buf_relse(bp); + } + ASSERT(pathlen == 0); + + link[ip->i_d.di_size] = '\0'; + error = 0; + + out: + return error; +} + +int +xfs_readlink( + struct xfs_inode *ip, + char *link) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t pathlen; + int error = 0; + + trace_xfs_readlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_SHARED); + + pathlen = ip->i_d.di_size; + if (!pathlen) + goto out; + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)", + __func__, (unsigned long long) ip->i_ino, + (long long) pathlen); + ASSERT(0); + error = XFS_ERROR(EFSCORRUPTED); + goto out; + } + + + if (ip->i_df.if_flags & XFS_IFINLINE) { + memcpy(link, ip->i_df.if_u1.if_data, pathlen); + link[pathlen] = '\0'; + } else { + error = xfs_readlink_bmap(ip, link); + } + + out: + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return error; +} + +int +xfs_symlink( + struct xfs_inode *dp, + struct xfs_name *link_name, + const char *target_path, + umode_t mode, + struct xfs_inode **ipp) +{ + struct xfs_mount *mp = dp->i_mount; + struct xfs_trans *tp = NULL; + struct xfs_inode *ip = NULL; + int error = 0; + int pathlen; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + bool unlock_dp_on_error = false; + uint cancel_flags; + int committed; + xfs_fileoff_t first_fsb; + xfs_filblks_t fs_blocks; + int nmaps; + struct xfs_bmbt_irec mval[XFS_SYMLINK_MAPS]; + xfs_daddr_t d; + const char *cur_chunk; + int byte_cnt; + int n; + xfs_buf_t *bp; + prid_t prid; + struct xfs_dquot *udqp = NULL; + struct xfs_dquot *gdqp = NULL; + struct xfs_dquot *pdqp = NULL; + uint resblks; + + *ipp = NULL; + + trace_xfs_symlink(dp, link_name); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + /* + * Check component lengths of the target path name. + */ + pathlen = strlen(target_path); + if (pathlen >= MAXPATHLEN) /* total string too long */ + return XFS_ERROR(ENAMETOOLONG); + + udqp = gdqp = NULL; + prid = xfs_get_initial_prid(dp); + + /* + * Make sure that we have allocated dquot(s) on disk. + */ + error = xfs_qm_vop_dqalloc(dp, + xfs_kuid_to_uid(current_fsuid()), + xfs_kgid_to_gid(current_fsgid()), prid, + XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, + &udqp, &gdqp, &pdqp); + if (error) + goto std_return; + + tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); + cancel_flags = XFS_TRANS_RELEASE_LOG_RES; + /* + * The symlink will fit into the inode data fork? + * There can't be any attributes so we get the whole variable part. + */ + if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) + fs_blocks = 0; + else + fs_blocks = xfs_symlink_blocks(mp, pathlen); + resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); + if (error == ENOSPC && fs_blocks == 0) { + resblks = 0; + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); + } + if (error) { + cancel_flags = 0; + goto error_return; + } + + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); + unlock_dp_on_error = true; + + /* + * Check whether the directory allows new symlinks or not. + */ + if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { + error = XFS_ERROR(EPERM); + goto error_return; + } + + /* + * Reserve disk quota : blocks and inode. + */ + error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, + pdqp, resblks, 1, 0); + if (error) + goto error_return; + + /* + * Check for ability to enter directory entry, if no space reserved. + */ + error = xfs_dir_canenter(tp, dp, link_name, resblks); + if (error) + goto error_return; + /* + * Initialize the bmap freelist prior to calling either + * bmapi or the directory create code. + */ + xfs_bmap_init(&free_list, &first_block); + + /* + * Allocate an inode for the symlink. + */ + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, + prid, resblks > 0, &ip, NULL); + if (error) { + if (error == ENOSPC) + goto error_return; + goto error1; + } + + /* + * An error after we've joined dp to the transaction will result in the + * transaction cancel unlocking dp so don't do it explicitly in the + * error path. + */ + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); + unlock_dp_on_error = false; + + /* + * Also attach the dquot(s) to it, if applicable. + */ + xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); + + if (resblks) + resblks -= XFS_IALLOC_SPACE_RES(mp); + /* + * If the symlink will fit into the inode, write it inline. + */ + if (pathlen <= XFS_IFORK_DSIZE(ip)) { + xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); + memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); + ip->i_d.di_size = pathlen; + + /* + * The inode was initially created in extent format. + */ + ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); + ip->i_df.if_flags |= XFS_IFINLINE; + + ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; + xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); + + } else { + int offset; + + first_fsb = 0; + nmaps = XFS_SYMLINK_MAPS; + + error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks, + XFS_BMAPI_METADATA, &first_block, resblks, + mval, &nmaps, &free_list); + if (error) + goto error2; + + if (resblks) + resblks -= fs_blocks; + ip->i_d.di_size = pathlen; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + cur_chunk = target_path; + offset = 0; + for (n = 0; n < nmaps; n++) { + char *buf; + + d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); + byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, + BTOBB(byte_cnt), 0); + if (!bp) { + error = ENOMEM; + goto error2; + } + bp->b_ops = &xfs_symlink_buf_ops; + + byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); + byte_cnt = min(byte_cnt, pathlen); + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, + byte_cnt, bp); + + memcpy(buf, cur_chunk, byte_cnt); + + cur_chunk += byte_cnt; + pathlen -= byte_cnt; + offset += byte_cnt; + + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); + xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - + (char *)bp->b_addr); + } + ASSERT(pathlen == 0); + } + + /* + * Create the directory entry for the symlink. + */ + error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, + &first_block, &free_list, resblks); + if (error) + goto error2; + xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); + + /* + * If this is a synchronous mount, make sure that the + * symlink transaction goes to disk before returning to + * the user. + */ + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + xfs_trans_set_sync(tp); + } + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) { + goto error2; + } + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + *ipp = ip; + return 0; + + error2: + IRELE(ip); + error1: + xfs_bmap_cancel(&free_list); + cancel_flags |= XFS_TRANS_ABORT; + error_return: + xfs_trans_cancel(tp, cancel_flags); + xfs_qm_dqrele(udqp); + xfs_qm_dqrele(gdqp); + xfs_qm_dqrele(pdqp); + + if (unlock_dp_on_error) + xfs_iunlock(dp, XFS_ILOCK_EXCL); + std_return: + return error; +} + +/* + * Free a symlink that has blocks associated with it. + */ +STATIC int +xfs_inactive_symlink_rmt( + struct xfs_inode *ip) +{ + xfs_buf_t *bp; + int committed; + int done; + int error; + xfs_fsblock_t first_block; + xfs_bmap_free_t free_list; + int i; + xfs_mount_t *mp; + xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS]; + int nmaps; + int size; + xfs_trans_t *tp; + + mp = ip->i_mount; + ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS); + /* + * We're freeing a symlink that has some + * blocks allocated to it. Free the + * blocks here. We know that we've got + * either 1 or 2 extents and that we can + * free them all in one bunmapi call. + */ + ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); + + tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + /* + * Lock the inode, fix the size, and join it to the transaction. + * Hold it so in the normal path, we still have it locked for + * the second transaction. In the error paths we need it + * held so the cancel won't rele it, see below. + */ + size = (int)ip->i_d.di_size; + ip->i_d.di_size = 0; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Find the block(s) so we can inval and unmap them. + */ + done = 0; + xfs_bmap_init(&free_list, &first_block); + nmaps = ARRAY_SIZE(mval); + error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size), + mval, &nmaps, 0); + if (error) + goto error_trans_cancel; + /* + * Invalidate the block(s). No validation is done. + */ + for (i = 0; i < nmaps; i++) { + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), + XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); + if (!bp) { + error = ENOMEM; + goto error_bmap_cancel; + } + xfs_trans_binval(tp, bp); + } + /* + * Unmap the dead block(s) to the free_list. + */ + error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, + &first_block, &free_list, &done); + if (error) + goto error_bmap_cancel; + ASSERT(done); + /* + * Commit the first transaction. This logs the EFI and the inode. + */ + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto error_bmap_cancel; + /* + * The transaction must have been committed, since there were + * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. + * The new tp has the extent freeing and EFDs. + */ + ASSERT(committed); + /* + * The first xact was committed, so add the inode to the new one. + * Mark it dirty so it will be logged and moved forward in the log as + * part of every commit. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* + * Commit the transaction containing extent freeing and EFDs. + */ + error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); + goto error_unlock; + } + + /* + * Remove the memory for extent descriptions (just bookkeeping). + */ + if (ip->i_df.if_bytes) + xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); + ASSERT(ip->i_df.if_bytes == 0); + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + +error_bmap_cancel: + xfs_bmap_cancel(&free_list); +error_trans_cancel: + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); +error_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* + * xfs_inactive_symlink - free a symlink + */ +int +xfs_inactive_symlink( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + int pathlen; + + trace_xfs_inactive_symlink(ip); + + if (XFS_FORCED_SHUTDOWN(mp)) + return XFS_ERROR(EIO); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + /* + * Zero length symlinks _can_ exist. + */ + pathlen = (int)ip->i_d.di_size; + if (!pathlen) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return 0; + } + + if (pathlen < 0 || pathlen > MAXPATHLEN) { + xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)", + __func__, (unsigned long long)ip->i_ino, pathlen); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(0); + return XFS_ERROR(EFSCORRUPTED); + } + + if (ip->i_df.if_flags & XFS_IFINLINE) { + if (ip->i_df.if_bytes > 0) + xfs_idata_realloc(ip, -(ip->i_df.if_bytes), + XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + ASSERT(ip->i_df.if_bytes == 0); + return 0; + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + /* remove the remote symlink */ + return xfs_inactive_symlink_rmt(ip); +} diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/xfs_symlink.h index f8d279d7563..e75245d0911 100644 --- a/fs/xfs/linux-2.6/xfs_version.h +++ b/fs/xfs/xfs_symlink.h @@ -1,6 +1,5 @@ /* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. + * Copyright (c) 2012 Red Hat, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -15,15 +14,14 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef __XFS_VERSION_H__ -#define __XFS_VERSION_H__ +#ifndef __XFS_SYMLINK_H +#define __XFS_SYMLINK_H 1 -/* - * Dummy file that can contain a timestamp to put into the - * XFS init string, to help users keep track of what they're - * running - */ +/* Kernel only symlink defintions */ -#define XFS_VERSION_STRING "SGI XFS" +int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, + const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_readlink(struct xfs_inode *ip, char *link); +int xfs_inactive_symlink(struct xfs_inode *ip); -#endif /* __XFS_VERSION_H__ */ +#endif /* __XFS_SYMLINK_H */ diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c new file mode 100644 index 00000000000..23c2f2577c8 --- /dev/null +++ b/fs/xfs/xfs_symlink_remote.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2000-2006 Silicon Graphics, Inc. + * Copyright (c) 2012-2013 Red Hat, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_shared.h" +#include "xfs_trans_resv.h" +#include "xfs_ag.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_bmap_btree.h" +#include "xfs_inode.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_symlink.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_buf_item.h" + + +/* + * Each contiguous block has a header, so it is not just a simple pathlen + * to FSB conversion. + */ +int +xfs_symlink_blocks( + struct xfs_mount *mp, + int pathlen) +{ + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + + return (pathlen + buflen - 1) / buflen; +} + +int +xfs_symlink_hdr_set( + struct xfs_mount *mp, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return 0; + + dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); + dsl->sl_offset = cpu_to_be32(offset); + dsl->sl_bytes = cpu_to_be32(size); + uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_uuid); + dsl->sl_owner = cpu_to_be64(ino); + dsl->sl_blkno = cpu_to_be64(bp->b_bn); + bp->b_ops = &xfs_symlink_buf_ops; + + return sizeof(struct xfs_dsymlink_hdr); +} + +/* + * Checking of the symlink header is split into two parts. the verifier does + * CRC, location and bounds checking, the unpacking function checks the path + * parameters and owner. + */ +bool +xfs_symlink_hdr_ok( + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + struct xfs_buf *bp) +{ + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (offset != be32_to_cpu(dsl->sl_offset)) + return false; + if (size != be32_to_cpu(dsl->sl_bytes)) + return false; + if (ino != be64_to_cpu(dsl->sl_owner)) + return false; + + /* ok */ + return true; +} + +static bool +xfs_symlink_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (dsl->sl_magic != cpu_to_be32(XFS_SYMLINK_MAGIC)) + return false; + if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_uuid)) + return false; + if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + return false; + if (be32_to_cpu(dsl->sl_offset) + + be32_to_cpu(dsl->sl_bytes) >= MAXPATHLEN) + return false; + if (dsl->sl_owner == 0) + return false; + + return true; +} + +static void +xfs_symlink_read_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) + xfs_buf_ioerror(bp, EFSBADCRC); + else if (!xfs_symlink_verify(bp)) + xfs_buf_ioerror(bp, EFSCORRUPTED); + + if (bp->b_error) + xfs_verifier_error(bp); +} + +static void +xfs_symlink_write_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_buf_log_item *bip = bp->b_fspriv; + + /* no verification of non-crc buffers */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return; + + if (!xfs_symlink_verify(bp)) { + xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_verifier_error(bp); + return; + } + + if (bip) { + struct xfs_dsymlink_hdr *dsl = bp->b_addr; + dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF); +} + +const struct xfs_buf_ops xfs_symlink_buf_ops = { + .verify_read = xfs_symlink_read_verify, + .verify_write = xfs_symlink_write_verify, +}; + +void +xfs_symlink_local_to_remote( + struct xfs_trans *tp, + struct xfs_buf *bp, + struct xfs_inode *ip, + struct xfs_ifork *ifp) +{ + struct xfs_mount *mp = ip->i_mount; + char *buf; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) { + bp->b_ops = NULL; + memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + return; + } + + /* + * As this symlink fits in an inode literal area, it must also fit in + * the smallest buffer the filesystem supports. + */ + ASSERT(BBTOB(bp->b_length) >= + ifp->if_bytes + sizeof(struct xfs_dsymlink_hdr)); + + bp->b_ops = &xfs_symlink_buf_ops; + + buf = bp->b_addr; + buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); + memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); +} diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c new file mode 100644 index 00000000000..1743b9f8e23 --- /dev/null +++ b/fs/xfs/xfs_sysctl.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2001-2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include "xfs_error.h" + +static struct ctl_table_header *xfs_table_header; + +#ifdef CONFIG_PROC_FS +STATIC int +xfs_stats_clear_proc_handler( + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int c, ret, *valp = ctl->data; + __uint32_t vn_active; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + + if (!ret && write && *valp) { + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, + sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } + xfs_stats_clear = 0; + } + + return ret; +} + +STATIC int +xfs_panic_mask_proc_handler( + struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int ret, *valp = ctl->data; + + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); + if (!ret && write) { + xfs_panic_mask = *valp; +#ifdef DEBUG + xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES); +#endif + } + return ret; +} +#endif /* CONFIG_PROC_FS */ + +static struct ctl_table xfs_table[] = { + { + .procname = "irix_sgid_inherit", + .data = &xfs_params.sgid_inherit.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.sgid_inherit.min, + .extra2 = &xfs_params.sgid_inherit.max + }, + { + .procname = "irix_symlink_mode", + .data = &xfs_params.symlink_mode.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.symlink_mode.min, + .extra2 = &xfs_params.symlink_mode.max + }, + { + .procname = "panic_mask", + .data = &xfs_params.panic_mask.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_panic_mask_proc_handler, + .extra1 = &xfs_params.panic_mask.min, + .extra2 = &xfs_params.panic_mask.max + }, + + { + .procname = "error_level", + .data = &xfs_params.error_level.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.error_level.min, + .extra2 = &xfs_params.error_level.max + }, + { + .procname = "xfssyncd_centisecs", + .data = &xfs_params.syncd_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.syncd_timer.min, + .extra2 = &xfs_params.syncd_timer.max + }, + { + .procname = "inherit_sync", + .data = &xfs_params.inherit_sync.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_sync.min, + .extra2 = &xfs_params.inherit_sync.max + }, + { + .procname = "inherit_nodump", + .data = &xfs_params.inherit_nodump.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodump.min, + .extra2 = &xfs_params.inherit_nodump.max + }, + { + .procname = "inherit_noatime", + .data = &xfs_params.inherit_noatim.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_noatim.min, + .extra2 = &xfs_params.inherit_noatim.max + }, + { + .procname = "xfsbufd_centisecs", + .data = &xfs_params.xfs_buf_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.xfs_buf_timer.min, + .extra2 = &xfs_params.xfs_buf_timer.max + }, + { + .procname = "age_buffer_centisecs", + .data = &xfs_params.xfs_buf_age.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.xfs_buf_age.min, + .extra2 = &xfs_params.xfs_buf_age.max + }, + { + .procname = "inherit_nosymlinks", + .data = &xfs_params.inherit_nosym.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nosym.min, + .extra2 = &xfs_params.inherit_nosym.max + }, + { + .procname = "rotorstep", + .data = &xfs_params.rotorstep.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.rotorstep.min, + .extra2 = &xfs_params.rotorstep.max + }, + { + .procname = "inherit_nodefrag", + .data = &xfs_params.inherit_nodfrg.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.inherit_nodfrg.min, + .extra2 = &xfs_params.inherit_nodfrg.max + }, + { + .procname = "filestream_centisecs", + .data = &xfs_params.fstrm_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.fstrm_timer.min, + .extra2 = &xfs_params.fstrm_timer.max, + }, + { + .procname = "speculative_prealloc_lifetime", + .data = &xfs_params.eofb_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &xfs_params.eofb_timer.min, + .extra2 = &xfs_params.eofb_timer.max, + }, + /* please keep this the last entry */ +#ifdef CONFIG_PROC_FS + { + .procname = "stats_clear", + .data = &xfs_params.stats_clear.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_stats_clear_proc_handler, + .extra1 = &xfs_params.stats_clear.min, + .extra2 = &xfs_params.stats_clear.max + }, +#endif /* CONFIG_PROC_FS */ + + {} +}; + +static struct ctl_table xfs_dir_table[] = { + { + .procname = "xfs", + .mode = 0555, + .child = xfs_table + }, + {} +}; + +static struct ctl_table xfs_root_table[] = { + { + .procname = "fs", + .mode = 0555, + .child = xfs_dir_table + }, + {} +}; + +int +xfs_sysctl_register(void) +{ + xfs_table_header = register_sysctl_table(xfs_root_table); + if (!xfs_table_header) + return -ENOMEM; + return 0; +} + +void +xfs_sysctl_unregister(void) +{ + unregister_sysctl_table(xfs_table_header); +} diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index a631fb8cc5a..bd8e157c20e 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val { } xfs_sysctl_val_t; typedef struct xfs_param { - xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/ xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is * not a member of parent dir GID. */ xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ @@ -47,6 +46,8 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ + xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ + xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ } xfs_param_t; /* @@ -67,7 +68,7 @@ typedef struct xfs_param { enum { /* XFS_REFCACHE_SIZE = 1 */ /* XFS_REFCACHE_PURGE = 2 */ - XFS_RESTRICT_CHOWN = 3, + /* XFS_RESTRICT_CHOWN = 3 */ XFS_SGID_INHERIT = 4, XFS_SYMLINK_MODE = 5, XFS_PANIC_MASK = 6, @@ -86,15 +87,16 @@ enum { XFS_INHERIT_NOSYM = 19, XFS_ROTORSTEP = 20, XFS_INHERIT_NODFRG = 21, + XFS_FILESTREAM_TIMER = 22, }; extern xfs_param_t xfs_params; #ifdef CONFIG_SYSCTL -extern void xfs_sysctl_register(void); +extern int xfs_sysctl_register(void); extern void xfs_sysctl_unregister(void); #else -# define xfs_sysctl_register() do { } while (0) +# define xfs_sysctl_register() (0) # define xfs_sysctl_unregister() do { } while (0) #endif /* CONFIG_SYSCTL */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c new file mode 100644 index 00000000000..1e85bcd0e41 --- /dev/null +++ b/fs/xfs/xfs_trace.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_da_btree.h" +#include "xfs_ialloc.h" +#include "xfs_itable.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_trans.h" +#include "xfs_log.h" +#include "xfs_log_priv.h" +#include "xfs_buf_item.h" +#include "xfs_quota.h" +#include "xfs_iomap.h" +#include "xfs_aops.h" +#include "xfs_dquot_item.h" +#include "xfs_dquot.h" +#include "xfs_log_recover.h" +#include "xfs_inode_item.h" +#include "xfs_bmap_btree.h" +#include "xfs_filestream.h" + +/* + * We include this last to have the helpers above available for the trace + * event implementations. + */ +#define CREATE_TRACE_POINTS +#include "xfs_trace.h" diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h new file mode 100644 index 00000000000..152f8278263 --- /dev/null +++ b/fs/xfs/xfs_trace.h @@ -0,0 +1,2073 @@ +/* + * Copyright (c) 2009, Christoph Hellwig + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM xfs + +#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_XFS_H + +#include <linux/tracepoint.h> + +struct xfs_agf; +struct xfs_alloc_arg; +struct xfs_attr_list_context; +struct xfs_buf_log_item; +struct xfs_da_args; +struct xfs_da_node_entry; +struct xfs_dquot; +struct xfs_log_item; +struct xlog; +struct xlog_ticket; +struct xlog_recover; +struct xlog_recover_item; +struct xfs_buf_log_format; +struct xfs_inode_log_format; +struct xfs_bmbt_irec; + +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + +#define DEFINE_ATTR_LIST_EVENT(name) \ +DEFINE_EVENT(xfs_attr_list_class, name, \ + TP_PROTO(struct xfs_attr_list_context *ctx), \ + TP_ARGS(ctx)) +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); +DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); +DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); +DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); + +DECLARE_EVENT_CLASS(xfs_perag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, + unsigned long caller_ip), + TP_ARGS(mp, agno, refcount, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, refcount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->refcount = refcount; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u refcount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->refcount, + (char *)__entry->caller_ip) +); + +#define DEFINE_PERAG_REF_EVENT(name) \ +DEFINE_EVENT(xfs_perag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \ + unsigned long caller_ip), \ + TP_ARGS(mp, agno, refcount, caller_ip)) +DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); +DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); + +DECLARE_EVENT_CLASS(xfs_ag_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), + TP_ARGS(mp, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + ), + TP_printk("dev %d:%d agno %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno) +); +#define DEFINE_AG_EVENT(name) \ +DEFINE_EVENT(xfs_ag_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \ + TP_ARGS(mp, agno)) + +DEFINE_AG_EVENT(xfs_read_agf); +DEFINE_AG_EVENT(xfs_alloc_read_agf); +DEFINE_AG_EVENT(xfs_read_agi); +DEFINE_AG_EVENT(xfs_ialloc_read_agi); + +TRACE_EVENT(xfs_attr_list_node_descend, + TP_PROTO(struct xfs_attr_list_context *ctx, + struct xfs_da_node_entry *btree), + TP_ARGS(ctx, btree), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + __field(u32, bt_hashval) + __field(u32, bt_before) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + __entry->bt_hashval = be32_to_cpu(btree->hashval); + __entry->bt_before = be32_to_cpu(btree->before); + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s " + "node hashval %u, node before %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), + __entry->bt_hashval, + __entry->bt_before) +); + +TRACE_EVENT(xfs_iext_insert, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, + struct xfs_bmbt_irec *r, int state, unsigned long caller_ip), + TP_ARGS(ip, idx, r, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r->br_startoff; + __entry->startblock = r->br_startblock; + __entry->blockcount = r->br_blockcount; + __entry->state = r->br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %lld count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + +#define DEFINE_BMAP_EVENT(name) \ +DEFINE_EVENT(xfs_bmap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ + unsigned long caller_ip), \ + TP_ARGS(ip, idx, state, caller_ip)) +DEFINE_BMAP_EVENT(xfs_iext_remove); +DEFINE_BMAP_EVENT(xfs_bmap_pre_update); +DEFINE_BMAP_EVENT(xfs_bmap_post_update); +DEFINE_BMAP_EVENT(xfs_extlist); + +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(int, nblks) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->nblks = bp->b_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->nblks, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) +DEFINE_BUF_EVENT(xfs_buf_init); +DEFINE_BUF_EVENT(xfs_buf_free); +DEFINE_BUF_EVENT(xfs_buf_hold); +DEFINE_BUF_EVENT(xfs_buf_rele); +DEFINE_BUF_EVENT(xfs_buf_iodone); +DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_bawrite); +DEFINE_BUF_EVENT(xfs_buf_lock); +DEFINE_BUF_EVENT(xfs_buf_lock_done); +DEFINE_BUF_EVENT(xfs_buf_trylock); +DEFINE_BUF_EVENT(xfs_buf_unlock); +DEFINE_BUF_EVENT(xfs_buf_iowait); +DEFINE_BUF_EVENT(xfs_buf_iowait_done); +DEFINE_BUF_EVENT(xfs_buf_delwri_queue); +DEFINE_BUF_EVENT(xfs_buf_delwri_queued); +DEFINE_BUF_EVENT(xfs_buf_delwri_split); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); +DEFINE_BUF_EVENT(xfs_bdstrat_shut); +DEFINE_BUF_EVENT(xfs_buf_item_relse); +DEFINE_BUF_EVENT(xfs_buf_item_iodone); +DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); +DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); +DEFINE_BUF_EVENT(xfs_trans_read_buf_io); +DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); + +/* not really buffer traces, but the buf provides useful information */ +DEFINE_BUF_EVENT(xfs_btree_corrupt); +DEFINE_BUF_EVENT(xfs_da_btree_corrupt); +DEFINE_BUF_EVENT(xfs_reset_dqcounts); +DEFINE_BUF_EVENT(xfs_inode_item_push); + +/* pass flags explicitly */ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = BBTOB(bp->b_length); + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) +DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); + +TRACE_EVENT(xfs_buf_ioerror, + TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip), + TP_ARGS(bp, error, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(unsigned, flags) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(int, error) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = BBTOB(bp->b_length); + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = bp->b_sema.count; + __entry->error = error; + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d error %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __entry->error, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) +); + +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + __entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = BBTOB(bip->bli_buf->b_length); + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = bip->bli_buf->b_sema.count; + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); +DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); +DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur); +DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf); +DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); +DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); +DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); + +DECLARE_EVENT_CLASS(xfs_filestream_class, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), + TP_ARGS(ip, agno), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams) +) +#define DEFINE_FILESTREAM_EVENT(name) \ +DEFINE_EVENT(xfs_filestream_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \ + TP_ARGS(ip, agno)) +DEFINE_FILESTREAM_EVENT(xfs_filestream_free); +DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup); +DEFINE_FILESTREAM_EVENT(xfs_filestream_scan); + +TRACE_EVENT(xfs_filestream_pick, + TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno, + xfs_extlen_t free, int nscan), + TP_ARGS(ip, agno, free, nscan), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_agnumber_t, agno) + __field(int, streams) + __field(xfs_extlen_t, free) + __field(int, nscan) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->agno = agno; + __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno); + __entry->free = free; + __entry->nscan = nscan; + ), + TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->agno, + __entry->streams, + __entry->free, + __entry->nscan) +); + +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + +#define DEFINE_LOCK_EVENT(name) \ +DEFINE_EVENT(xfs_lock_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ + unsigned long caller_ip), \ + TP_ARGS(ip, lock_flags, caller_ip)) +DEFINE_LOCK_EVENT(xfs_ilock); +DEFINE_LOCK_EVENT(xfs_ilock_nowait); +DEFINE_LOCK_EVENT(xfs_ilock_demote); +DEFINE_LOCK_EVENT(xfs_iunlock); + +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + +#define DEFINE_INODE_EVENT(name) \ +DEFINE_EVENT(xfs_inode_class, name, \ + TP_PROTO(struct xfs_inode *ip), \ + TP_ARGS(ip)) +DEFINE_INODE_EVENT(xfs_iget_skip); +DEFINE_INODE_EVENT(xfs_iget_reclaim); +DEFINE_INODE_EVENT(xfs_iget_reclaim_fail); +DEFINE_INODE_EVENT(xfs_iget_hit); +DEFINE_INODE_EVENT(xfs_iget_miss); + +DEFINE_INODE_EVENT(xfs_getattr); +DEFINE_INODE_EVENT(xfs_setattr); +DEFINE_INODE_EVENT(xfs_readlink); +DEFINE_INODE_EVENT(xfs_inactive_symlink); +DEFINE_INODE_EVENT(xfs_alloc_file_space); +DEFINE_INODE_EVENT(xfs_free_file_space); +DEFINE_INODE_EVENT(xfs_zero_file_space); +DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_readdir); +#ifdef CONFIG_XFS_POSIX_ACL +DEFINE_INODE_EVENT(xfs_get_acl); +#endif +DEFINE_INODE_EVENT(xfs_vm_bmap); +DEFINE_INODE_EVENT(xfs_file_ioctl); +DEFINE_INODE_EVENT(xfs_file_compat_ioctl); +DEFINE_INODE_EVENT(xfs_ioctl_setattr); +DEFINE_INODE_EVENT(xfs_dir_fsync); +DEFINE_INODE_EVENT(xfs_file_fsync); +DEFINE_INODE_EVENT(xfs_destroy_inode); +DEFINE_INODE_EVENT(xfs_evict_inode); +DEFINE_INODE_EVENT(xfs_update_time); + +DEFINE_INODE_EVENT(xfs_dquot_dqalloc); +DEFINE_INODE_EVENT(xfs_dquot_dqdetach); + +DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); +DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); + +DECLARE_EVENT_CLASS(xfs_iref_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(int, pincount) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->pincount = atomic_read(&ip->i_pincount); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + __entry->pincount, + (char *)__entry->caller_ip) +) + +TRACE_EVENT(xfs_iomap_prealloc_size, + TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t blocks, int shift, + unsigned int writeio_blocks), + TP_ARGS(ip, blocks, shift, writeio_blocks), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsblock_t, blocks) + __field(int, shift) + __field(unsigned int, writeio_blocks) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->blocks = blocks; + __entry->shift = shift; + __entry->writeio_blocks = writeio_blocks; + ), + TP_printk("dev %d:%d ino 0x%llx prealloc blocks %llu shift %d " + "m_writeio_blocks %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, + __entry->blocks, __entry->shift, __entry->writeio_blocks) +) + +#define DEFINE_IREF_EVENT(name) \ +DEFINE_EVENT(xfs_iref_class, name, \ + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ + TP_ARGS(ip, caller_ip)) +DEFINE_IREF_EVENT(xfs_ihold); +DEFINE_IREF_EVENT(xfs_irele); +DEFINE_IREF_EVENT(xfs_inode_pin); +DEFINE_IREF_EVENT(xfs_inode_unpin); +DEFINE_IREF_EVENT(xfs_inode_unpin_nowait); + +DECLARE_EVENT_CLASS(xfs_namespace_class, + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), + TP_ARGS(dp, name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, dp_ino) + __field(int, namelen) + __dynamic_array(char, name, name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(dp)->i_sb->s_dev; + __entry->dp_ino = dp->i_ino; + __entry->namelen = name->len; + memcpy(__get_str(name), name->name, name->len); + ), + TP_printk("dev %d:%d dp ino 0x%llx name %.*s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dp_ino, + __entry->namelen, + __get_str(name)) +) + +#define DEFINE_NAMESPACE_EVENT(name) \ +DEFINE_EVENT(xfs_namespace_class, name, \ + TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \ + TP_ARGS(dp, name)) +DEFINE_NAMESPACE_EVENT(xfs_remove); +DEFINE_NAMESPACE_EVENT(xfs_link); +DEFINE_NAMESPACE_EVENT(xfs_lookup); +DEFINE_NAMESPACE_EVENT(xfs_create); +DEFINE_NAMESPACE_EVENT(xfs_symlink); + +TRACE_EVENT(xfs_rename, + TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp, + struct xfs_name *src_name, struct xfs_name *target_name), + TP_ARGS(src_dp, target_dp, src_name, target_name), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, src_dp_ino) + __field(xfs_ino_t, target_dp_ino) + __field(int, src_namelen) + __field(int, target_namelen) + __dynamic_array(char, src_name, src_name->len) + __dynamic_array(char, target_name, target_name->len) + ), + TP_fast_assign( + __entry->dev = VFS_I(src_dp)->i_sb->s_dev; + __entry->src_dp_ino = src_dp->i_ino; + __entry->target_dp_ino = target_dp->i_ino; + __entry->src_namelen = src_name->len; + __entry->target_namelen = target_name->len; + memcpy(__get_str(src_name), src_name->name, src_name->len); + memcpy(__get_str(target_name), target_name->name, + target_name->len); + ), + TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx" + " src name %.*s target name %.*s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->src_dp_ino, + __entry->target_dp_ino, + __entry->src_namelen, + __get_str(src_name), + __entry->target_namelen, + __get_str(target_name)) +) + +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) + ), \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = be32_to_cpu(dqp->q_core.d_id); + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx " + "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->id, + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) +) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) +DEFINE_DQUOT_EVENT(xfs_dqadjust); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_want); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy); +DEFINE_DQUOT_EVENT(xfs_dqreclaim_done); +DEFINE_DQUOT_EVENT(xfs_dqattach_found); +DEFINE_DQUOT_EVENT(xfs_dqattach_get); +DEFINE_DQUOT_EVENT(xfs_dqalloc); +DEFINE_DQUOT_EVENT(xfs_dqtobp_read); +DEFINE_DQUOT_EVENT(xfs_dqread); +DEFINE_DQUOT_EVENT(xfs_dqread_fail); +DEFINE_DQUOT_EVENT(xfs_dqget_hit); +DEFINE_DQUOT_EVENT(xfs_dqget_miss); +DEFINE_DQUOT_EVENT(xfs_dqget_freeing); +DEFINE_DQUOT_EVENT(xfs_dqget_dup); +DEFINE_DQUOT_EVENT(xfs_dqput); +DEFINE_DQUOT_EVENT(xfs_dqput_wait); +DEFINE_DQUOT_EVENT(xfs_dqput_free); +DEFINE_DQUOT_EVENT(xfs_dqrele); +DEFINE_DQUOT_EVENT(xfs_dqflush); +DEFINE_DQUOT_EVENT(xfs_dqflush_force); +DEFINE_DQUOT_EVENT(xfs_dqflush_done); + +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(int, reserveq) + __field(int, writeq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + __field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserveq = list_empty(&log->l_reserve_head.waiters); + __entry->writeq = list_empty(&log->l_write_head.waiters); + xlog_crack_grant_head(&log->l_reserve_head.grant, + &__entry->grant_reserve_cycle, + &__entry->grant_reserve_bytes); + xlog_crack_grant_head(&log->l_write_head.grant, + &__entry->grant_write_cycle, + &__entry->grant_write_bytes); + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = atomic64_read(&log->l_tail_lsn); + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserveq %s " + "writeq %s grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserveq ? "empty" : "active", + __entry->writeq ? "empty" : "active", + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) + +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ + TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \ + TP_ARGS(log, tic)) +DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); +DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); +DEFINE_LOGGRANT_EVENT(xfs_log_umount_write); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake); +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve); +DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); +DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); + +DECLARE_EVENT_CLASS(xfs_log_item_class, + TP_PROTO(struct xfs_log_item *lip), + TP_ARGS(lip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->lsn = lip->li_lsn; + ), + TP_printk("dev %d:%d lip 0x%p lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->lsn), BLOCK_LSN(__entry->lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +TRACE_EVENT(xfs_log_force, + TP_PROTO(struct xfs_mount *mp, xfs_lsn_t lsn), + TP_ARGS(mp, lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->lsn = lsn; + ), + TP_printk("dev %d:%d lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lsn) +) + +#define DEFINE_LOG_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_log_item_class, name, \ + TP_PROTO(struct xfs_log_item *lip), \ + TP_ARGS(lip)) +DEFINE_LOG_ITEM_EVENT(xfs_ail_push); +DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); +DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); +DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing); + +DECLARE_EVENT_CLASS(xfs_ail_class, + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), + TP_ARGS(lip, old_lsn, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(void *, lip) + __field(uint, type) + __field(uint, flags) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, new_lsn) + ), + TP_fast_assign( + __entry->dev = lip->li_mountp->m_super->s_dev; + __entry->lip = lip; + __entry->type = lip->li_type; + __entry->flags = lip->li_flags; + __entry->old_lsn = old_lsn; + __entry->new_lsn = new_lsn; + ), + TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->lip, + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __print_flags(__entry->flags, "|", XFS_LI_FLAGS)) +) + +#define DEFINE_AIL_EVENT(name) \ +DEFINE_EVENT(xfs_ail_class, name, \ + TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \ + TP_ARGS(lip, old_lsn, new_lsn)) +DEFINE_AIL_EVENT(xfs_ail_insert); +DEFINE_AIL_EVENT(xfs_ail_move); +DEFINE_AIL_EVENT(xfs_ail_delete); + +TRACE_EVENT(xfs_log_assign_tail_lsn, + TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn), + TP_ARGS(log, new_lsn), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_lsn_t, new_lsn) + __field(xfs_lsn_t, old_lsn) + __field(xfs_lsn_t, last_sync_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->new_lsn = new_lsn; + __entry->old_lsn = atomic64_read(&log->l_tail_lsn); + __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn); + ), + TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn), + CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn), + CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn)) +) + +DECLARE_EVENT_CLASS(xfs_file_class, + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), + TP_ARGS(ip, count, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx " + "offset 0x%llx count 0x%zx ioflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->offset, + __entry->count, + __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) +) + +#define DEFINE_RW_EVENT(name) \ +DEFINE_EVENT(xfs_file_class, name, \ + TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ + TP_ARGS(ip, count, offset, flags)) +DEFINE_RW_EVENT(xfs_file_read); +DEFINE_RW_EVENT(xfs_file_buffered_write); +DEFINE_RW_EVENT(xfs_file_direct_write); +DEFINE_RW_EVENT(xfs_file_splice_read); + +DECLARE_EVENT_CLASS(xfs_page_class, + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, + unsigned int len), + TP_ARGS(inode, page, off, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(pgoff_t, pgoff) + __field(loff_t, size) + __field(unsigned long, offset) + __field(unsigned int, length) + __field(int, delalloc) + __field(int, unwritten) + ), + TP_fast_assign( + int delalloc = -1, unwritten = -1; + + if (page_has_buffers(page)) + xfs_count_page_state(page, &delalloc, &unwritten); + __entry->dev = inode->i_sb->s_dev; + __entry->ino = XFS_I(inode)->i_ino; + __entry->pgoff = page_offset(page); + __entry->size = i_size_read(inode); + __entry->offset = off; + __entry->length = len; + __entry->delalloc = delalloc; + __entry->unwritten = unwritten; + ), + TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " + "length %x delalloc %d unwritten %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->size, + __entry->offset, + __entry->length, + __entry->delalloc, + __entry->unwritten) +) + +#define DEFINE_PAGE_EVENT(name) \ +DEFINE_EVENT(xfs_page_class, name, \ + TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \ + unsigned int len), \ + TP_ARGS(inode, page, off, len)) +DEFINE_PAGE_EVENT(xfs_writepage); +DEFINE_PAGE_EVENT(xfs_releasepage); +DEFINE_PAGE_EVENT(xfs_invalidatepage); + +DECLARE_EVENT_CLASS(xfs_imap_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, + int type, struct xfs_bmbt_irec *irec), + TP_ARGS(ip, offset, count, type, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, size) + __field(loff_t, offset) + __field(size_t, count) + __field(int, type) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + __entry->type = type; + __entry->startoff = irec ? irec->br_startoff : 0; + __entry->startblock = irec ? irec->br_startblock : 0; + __entry->blockcount = irec ? irec->br_blockcount : 0; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " + "type %s startoff 0x%llx startblock %lld blockcount 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->offset, + __entry->count, + __print_symbolic(__entry->type, XFS_IO_TYPES), + __entry->startoff, + (__int64_t)__entry->startblock, + __entry->blockcount) +) + +#define DEFINE_IOMAP_EVENT(name) \ +DEFINE_EVENT(xfs_imap_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ + int type, struct xfs_bmbt_irec *irec), \ + TP_ARGS(ip, offset, count, type, irec)) +DEFINE_IOMAP_EVENT(xfs_map_blocks_found); +DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_get_blocks_found); +DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); + +DECLARE_EVENT_CLASS(xfs_simple_io_class, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(loff_t, isize) + __field(loff_t, disize) + __field(loff_t, offset) + __field(size_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->isize = VFS_I(ip)->i_size; + __entry->disize = ip->i_d.di_size; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " + "offset 0x%llx count %zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->isize, + __entry->disize, + __entry->offset, + __entry->count) +); + +#define DEFINE_SIMPLE_IO_EVENT(name) \ +DEFINE_EVENT(xfs_simple_io_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ + TP_ARGS(ip, offset, count)) +DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); +DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); +DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); +DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); + +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + +#define DEFINE_ITRUNC_EVENT(name) \ +DEFINE_EVENT(xfs_itrunc_class, name, \ + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ + TP_ARGS(ip, new_size)) +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start); +DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end); + +TRACE_EVENT(xfs_pagecache_inval, + TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), + TP_ARGS(ip, start, finish), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_off_t, start) + __field(xfs_off_t, finish) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->start = start; + __entry->finish = finish; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->start, + __entry->finish) +); + +TRACE_EVENT(xfs_bunmap, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + int flags, unsigned long caller_ip), + TP_ARGS(ip, bno, len, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fileoff_t, bno) + __field(xfs_filblks_t, len) + __field(unsigned long, caller_ip) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->bno = bno; + __entry->len = len; + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + "flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->bno, + __entry->len, + __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), + (void *)__entry->caller_ip) + +); + +DECLARE_EVENT_CLASS(xfs_extent_busy_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_extent_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_extent_busy); +DEFINE_BUSY_EVENT(xfs_extent_busy_enomem); +DEFINE_BUSY_EVENT(xfs_extent_busy_force); +DEFINE_BUSY_EVENT(xfs_extent_busy_reuse); +DEFINE_BUSY_EVENT(xfs_extent_busy_clear); + +TRACE_EVENT(xfs_extent_busy_trim, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->tbno = tbno; + __entry->tlen = tlen; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->tbno, + __entry->tlen) +); + +TRACE_EVENT(xfs_trans_commit_lsn, + TP_PROTO(struct xfs_trans *trans), + TP_ARGS(trans), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(struct xfs_trans *, tp) + __field(xfs_lsn_t, lsn) + ), + TP_fast_assign( + __entry->dev = trans->t_mountp->m_super->s_dev; + __entry->tp = trans; + __entry->lsn = trans->t_commit_lsn; + ), + TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tp, + __entry->lsn) +); + +TRACE_EVENT(xfs_agf, + TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags, + unsigned long caller_ip), + TP_ARGS(mp, agf, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, flags) + __field(__u32, length) + __field(__u32, bno_root) + __field(__u32, cnt_root) + __field(__u32, bno_level) + __field(__u32, cnt_level) + __field(__u32, flfirst) + __field(__u32, fllast) + __field(__u32, flcount) + __field(__u32, freeblks) + __field(__u32, longest) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = be32_to_cpu(agf->agf_seqno), + __entry->flags = flags; + __entry->length = be32_to_cpu(agf->agf_length), + __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]), + __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]), + __entry->bno_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]), + __entry->cnt_level = + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]), + __entry->flfirst = be32_to_cpu(agf->agf_flfirst), + __entry->fllast = be32_to_cpu(agf->agf_fllast), + __entry->flcount = be32_to_cpu(agf->agf_flcount), + __entry->freeblks = be32_to_cpu(agf->agf_freeblks), + __entry->longest = be32_to_cpu(agf->agf_longest); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + "levels b %u c %u flfirst %u fllast %u flcount %u " + "freeblks %u longest %u caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __print_flags(__entry->flags, "|", XFS_AGF_FLAGS), + __entry->length, + __entry->bno_root, + __entry->cnt_root, + __entry->bno_level, + __entry->cnt_level, + __entry->flfirst, + __entry->fllast, + __entry->flcount, + __entry->freeblks, + __entry->longest, + (void *)__entry->caller_ip) +); + +TRACE_EVENT(xfs_free_extent, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_extlen_t len, bool isfl, int haveleft, int haveright), + TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(int, isfl) + __field(int, haveleft) + __field(int, haveright) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->isfl = isfl; + __entry->haveleft = haveleft; + __entry->haveright = haveright; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->isfl, + __entry->haveleft ? + (__entry->haveright ? "both" : "left") : + (__entry->haveright ? "right" : "none")) + +); + +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + (unsigned long long)__entry->firstblock) +) + +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) +DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound); +DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_near_first); +DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); +DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); +DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); +DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); +DEFINE_ALLOC_EVENT(xfs_alloc_size_done); +DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); +DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); +DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); +DEFINE_ALLOC_EVENT(xfs_alloc_small_done); +DEFINE_ALLOC_EVENT(xfs_alloc_small_error); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); +DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); + +DECLARE_EVENT_CLASS(xfs_da_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_attr_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(int, valuelen) + __field(xfs_dahash_t, hashval) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->valuelen = args->valuelen; + __entry->hashval = args->hashval; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " + "hashval 0x%x op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->valuelen, + __entry->hashval, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_ATTR_EVENT(name) \ +DEFINE_EVENT(xfs_attr_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_ATTR_EVENT(xfs_attr_sf_add); +DEFINE_ATTR_EVENT(xfs_attr_sf_addname); +DEFINE_ATTR_EVENT(xfs_attr_sf_create); +DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_sf_remove); +DEFINE_ATTR_EVENT(xfs_attr_sf_removename); +DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); + +DEFINE_ATTR_EVENT(xfs_attr_leaf_add); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); +DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); +DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); +DEFINE_ATTR_EVENT(xfs_attr_leaf_create); +DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); +DEFINE_ATTR_EVENT(xfs_attr_leaf_get); +DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); +DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); +DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); +DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); +DEFINE_ATTR_EVENT(xfs_attr_leaf_split_after); +DEFINE_ATTR_EVENT(xfs_attr_leaf_clearflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_setflag); +DEFINE_ATTR_EVENT(xfs_attr_leaf_flipflags); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); +DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); +DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); +DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); + +DEFINE_ATTR_EVENT(xfs_attr_node_addname); +DEFINE_ATTR_EVENT(xfs_attr_node_get); +DEFINE_ATTR_EVENT(xfs_attr_node_lookup); +DEFINE_ATTR_EVENT(xfs_attr_node_replace); +DEFINE_ATTR_EVENT(xfs_attr_node_removename); + +DEFINE_ATTR_EVENT(xfs_attr_fillstate); +DEFINE_ATTR_EVENT(xfs_attr_refillstate); + +DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); +DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); + +#define DEFINE_DA_EVENT(name) \ +DEFINE_EVENT(xfs_da_class, name, \ + TP_PROTO(struct xfs_da_args *args), \ + TP_ARGS(args)) +DEFINE_DA_EVENT(xfs_da_split); +DEFINE_DA_EVENT(xfs_da_join); +DEFINE_DA_EVENT(xfs_da_link_before); +DEFINE_DA_EVENT(xfs_da_link_after); +DEFINE_DA_EVENT(xfs_da_unlink_back); +DEFINE_DA_EVENT(xfs_da_unlink_forward); +DEFINE_DA_EVENT(xfs_da_root_split); +DEFINE_DA_EVENT(xfs_da_root_join); +DEFINE_DA_EVENT(xfs_da_node_add); +DEFINE_DA_EVENT(xfs_da_node_create); +DEFINE_DA_EVENT(xfs_da_node_split); +DEFINE_DA_EVENT(xfs_da_node_remove); +DEFINE_DA_EVENT(xfs_da_node_rebalance); +DEFINE_DA_EVENT(xfs_da_node_unbalance); +DEFINE_DA_EVENT(xfs_da_node_toosmall); +DEFINE_DA_EVENT(xfs_da_swap_lastblock); +DEFINE_DA_EVENT(xfs_da_grow_inode); +DEFINE_DA_EVENT(xfs_da_shrink_inode); +DEFINE_DA_EVENT(xfs_da_fixhashpath); +DEFINE_DA_EVENT(xfs_da_path_shift); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) +) + +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ + TP_PROTO(struct xfs_da_args *args, int idx), \ + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); + +TRACE_EVENT(xfs_dir2_leafn_moveents, + TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), + TP_ARGS(args, src_idx, dst_idx, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, src_idx) + __field(int, dst_idx) + __field(int, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->src_idx = src_idx; + __entry->dst_idx = dst_idx; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s " + "src_idx %d dst_idx %d count %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->src_idx, + __entry->dst_idx, + __entry->count) +); + +#define XFS_SWAPEXT_INODES \ + { 0, "target" }, \ + { 1, "temp" } + +#define XFS_INODE_FORMAT_STR \ + { 0, "invalid" }, \ + { 1, "local" }, \ + { 2, "extent" }, \ + { 3, "btree" } + +DECLARE_EVENT_CLASS(xfs_swap_extent_class, + TP_PROTO(struct xfs_inode *ip, int which), + TP_ARGS(ip, which), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, which) + __field(xfs_ino_t, ino) + __field(int, format) + __field(int, nex) + __field(int, broot_size) + __field(int, fork_off) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->which = which; + __entry->ino = ip->i_ino; + __entry->format = ip->i_d.di_format; + __entry->nex = ip->i_d.di_nextents; + __entry->broot_size = ip->i_df.if_broot_bytes; + __entry->fork_off = XFS_IFORK_BOFF(ip); + ), + TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " + "broot size %d, fork offset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), + __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), + __entry->nex, + __entry->broot_size, + __entry->fork_off) +) + +#define DEFINE_SWAPEXT_EVENT(name) \ +DEFINE_EVENT(xfs_swap_extent_class, name, \ + TP_PROTO(struct xfs_inode *ip, int which), \ + TP_ARGS(ip, which)) + +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); +DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); + +DECLARE_EVENT_CLASS(xfs_log_recover_item_class, + TP_PROTO(struct xlog *log, struct xlog_recover *trans, + struct xlog_recover_item *item, int pass), + TP_ARGS(log, trans, item, pass), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, item) + __field(xlog_tid_t, tid) + __field(int, type) + __field(int, pass) + __field(int, count) + __field(int, total) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->item = (unsigned long)item; + __entry->tid = trans->r_log_tid; + __entry->type = ITEM_TYPE(item); + __entry->pass = pass; + __entry->count = item->ri_cnt; + __entry->total = item->ri_total; + ), + TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s " + "item region count/total %d/%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->pass, + (void *)__entry->item, + __print_symbolic(__entry->type, XFS_LI_TYPE_DESC), + __entry->count, + __entry->total) +) + +#define DEFINE_LOG_RECOVER_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_item_class, name, \ + TP_PROTO(struct xlog *log, struct xlog_recover *trans, \ + struct xlog_recover_item *item, int pass), \ + TP_ARGS(log, trans, item, pass)) + +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail); +DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover); + +DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), + TP_ARGS(log, buf_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__int64_t, blkno) + __field(unsigned short, len) + __field(unsigned short, flags) + __field(unsigned short, size) + __field(unsigned int, map_size) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->blkno = buf_f->blf_blkno; + __entry->len = buf_f->blf_len; + __entry->flags = buf_f->blf_flags; + __entry->size = buf_f->blf_size; + __entry->map_size = buf_f->blf_map_size; + ), + TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + "map_size %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->blkno, + __entry->len, + __entry->flags, + __entry->size, + __entry->map_size) +) + +#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_buf_log_format *buf_f), \ + TP_ARGS(log, buf_f)) + +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf); +DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf); + +DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), + TP_ARGS(log, in_f), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(unsigned short, size) + __field(int, fields) + __field(unsigned short, asize) + __field(unsigned short, dsize) + __field(__int64_t, blkno) + __field(int, len) + __field(int, boffset) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->ino = in_f->ilf_ino; + __entry->size = in_f->ilf_size; + __entry->fields = in_f->ilf_fields; + __entry->asize = in_f->ilf_asize; + __entry->dsize = in_f->ilf_dsize; + __entry->blkno = in_f->ilf_blkno; + __entry->len = in_f->ilf_len; + __entry->boffset = in_f->ilf_boffset; + ), + TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " + "dsize %d, blkno 0x%llx, len %d, boffset %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->fields, + __entry->asize, + __entry->dsize, + __entry->blkno, + __entry->len, + __entry->boffset) +) +#define DEFINE_LOG_RECOVER_INO_ITEM(name) \ +DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \ + TP_PROTO(struct xlog *log, struct xfs_inode_log_format *in_f), \ + TP_ARGS(log, in_f)) + +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel); +DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip); + +DECLARE_EVENT_CLASS(xfs_discard_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +) + +#define DEFINE_DISCARD_EVENT(name) \ +DEFINE_EVENT(xfs_discard_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_DISCARD_EVENT(xfs_discard_extent); +DEFINE_DISCARD_EVENT(xfs_discard_toosmall); +DEFINE_DISCARD_EVENT(xfs_discard_exclude); +DEFINE_DISCARD_EVENT(xfs_discard_busy); + +#endif /* _TRACE_XFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE xfs_trace +#include <trace/define_trace.h> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ee2721e0de4..d03932564cc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,175 +18,24 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" +#include "xfs_extent_busy.h" #include "xfs_quota.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" -#include "xfs_trans_space.h" - - -STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *); -STATIC uint xfs_trans_count_vecs(xfs_trans_t *); -STATIC void xfs_trans_fill_vecs(xfs_trans_t *, xfs_log_iovec_t *); -STATIC void xfs_trans_uncommit(xfs_trans_t *, uint); -STATIC void xfs_trans_committed(xfs_trans_t *, int); -STATIC void xfs_trans_chunk_committed(xfs_log_item_chunk_t *, xfs_lsn_t, int); -STATIC void xfs_trans_free(xfs_trans_t *); +#include "xfs_log.h" +#include "xfs_trace.h" +#include "xfs_error.h" kmem_zone_t *xfs_trans_zone; - - -/* - * Reservation functions here avoid a huge stack in xfs_trans_init - * due to register overflow from temporaries in the calculations. - */ - -STATIC uint -xfs_calc_write_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_itruncate_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_rename_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_link_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_remove_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_symlink_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_create_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_mkdir_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_ifree_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_ichange_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_growdata_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_GROWDATA_LOG_RES(mp); -} - -STATIC uint -xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_GROWRTALLOC_LOG_RES(mp); -} - -STATIC uint -xfs_calc_growrtzero_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_GROWRTZERO_LOG_RES(mp); -} - -STATIC uint -xfs_calc_growrtfree_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_GROWRTFREE_LOG_RES(mp); -} - -STATIC uint -xfs_calc_swrite_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_SWRITE_LOG_RES(mp); -} - -STATIC uint -xfs_calc_writeid_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_WRITEID_LOG_RES(mp); -} - -STATIC uint -xfs_calc_addafork_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_attrinval_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_ATTRINVAL_LOG_RES(mp); -} - -STATIC uint -xfs_calc_attrset_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_attrrm_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); -} - -STATIC uint -xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) -{ - return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); -} +kmem_zone_t *xfs_log_item_desc_zone; /* * Initialize the precomputed transaction reservation values @@ -193,32 +43,9 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) */ void xfs_trans_init( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_trans_reservations_t *resp; - - resp = &(mp->m_reservations); - resp->tr_write = xfs_calc_write_reservation(mp); - resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); - resp->tr_rename = xfs_calc_rename_reservation(mp); - resp->tr_link = xfs_calc_link_reservation(mp); - resp->tr_remove = xfs_calc_remove_reservation(mp); - resp->tr_symlink = xfs_calc_symlink_reservation(mp); - resp->tr_create = xfs_calc_create_reservation(mp); - resp->tr_mkdir = xfs_calc_mkdir_reservation(mp); - resp->tr_ifree = xfs_calc_ifree_reservation(mp); - resp->tr_ichange = xfs_calc_ichange_reservation(mp); - resp->tr_growdata = xfs_calc_growdata_reservation(mp); - resp->tr_swrite = xfs_calc_swrite_reservation(mp); - resp->tr_writeid = xfs_calc_writeid_reservation(mp); - resp->tr_addafork = xfs_calc_addafork_reservation(mp); - resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); - resp->tr_attrset = xfs_calc_attrset_reservation(mp); - resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); - resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); - resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); - resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); - resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); + xfs_trans_resv_calc(mp, M_RES(mp)); } /* @@ -234,31 +61,53 @@ xfs_trans_alloc( xfs_mount_t *mp, uint type) { - vfs_wait_for_freeze(XFS_MTOVFS(mp), SB_FREEZE_TRANS); - return _xfs_trans_alloc(mp, type); + xfs_trans_t *tp; + + sb_start_intwrite(mp->m_super); + tp = _xfs_trans_alloc(mp, type, KM_SLEEP); + tp->t_flags |= XFS_TRANS_FREEZE_PROT; + return tp; } xfs_trans_t * _xfs_trans_alloc( xfs_mount_t *mp, - uint type) + uint type, + xfs_km_flags_t memflags) { xfs_trans_t *tp; + WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); atomic_inc(&mp->m_active_trans); - tp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); - tp->t_magic = XFS_TRANS_MAGIC; + tp = kmem_zone_zalloc(xfs_trans_zone, memflags); + tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_type = type; tp->t_mountp = mp; - tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_busy_free = XFS_LBC_NUM_SLOTS; - XFS_LIC_INIT(&(tp->t_items)); - XFS_LBC_INIT(&(tp->t_busy)); + INIT_LIST_HEAD(&tp->t_items); + INIT_LIST_HEAD(&tp->t_busy); return tp; } /* + * Free the transaction structure. If there is more clean up + * to do when the structure is freed, add it here. + */ +STATIC void +xfs_trans_free( + struct xfs_trans *tp) +{ + xfs_extent_busy_sort(&tp->t_busy); + xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + + atomic_dec(&tp->t_mountp->m_active_trans); + if (tp->t_flags & XFS_TRANS_FREEZE_PROT) + sb_end_intwrite(tp->t_mountp->m_super); + xfs_trans_free_dqinfo(tp); + kmem_zone_free(xfs_trans_zone, tp); +} + +/* * This is called to create a new transaction which will share the * permanent log reservation of the given transaction. The remaining * unused block and rt extent reservations are also inherited. This @@ -277,26 +126,28 @@ xfs_trans_dup( /* * Initialize the new transaction structure. */ - ntp->t_magic = XFS_TRANS_MAGIC; + ntp->t_magic = XFS_TRANS_HEADER_MAGIC; ntp->t_type = tp->t_type; ntp->t_mountp = tp->t_mountp; - ntp->t_items_free = XFS_LIC_NUM_SLOTS; - ntp->t_busy_free = XFS_LBC_NUM_SLOTS; - XFS_LIC_INIT(&(ntp->t_items)); - XFS_LBC_INIT(&(ntp->t_busy)); + INIT_LIST_HEAD(&ntp->t_items); + INIT_LIST_HEAD(&ntp->t_busy); ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(tp->t_ticket != NULL); - ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE); - ntp->t_ticket = tp->t_ticket; + ntp->t_flags = XFS_TRANS_PERM_LOG_RES | + (tp->t_flags & XFS_TRANS_RESERVE) | + (tp->t_flags & XFS_TRANS_FREEZE_PROT); + /* We gave our writer reference to the new transaction */ + tp->t_flags &= ~XFS_TRANS_FREEZE_PROT; + ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket); ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used; tp->t_blk_res = tp->t_blk_res_used; ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; ntp->t_pflags = tp->t_pflags; - XFS_TRANS_DUP_DQINFO(tp->t_mountp, tp, ntp); + xfs_trans_dup_dqinfo(tp, ntp); atomic_inc(&tp->t_mountp->m_active_trans); return ntp; @@ -318,14 +169,11 @@ xfs_trans_dup( */ int xfs_trans_reserve( - xfs_trans_t *tp, - uint blocks, - uint logspace, - uint rtextents, - uint flags, - uint logcount) + struct xfs_trans *tp, + struct xfs_trans_res *resp, + uint blocks, + uint rtextents) { - int log_flags; int error = 0; int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; @@ -338,8 +186,8 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, - -blocks, rsvd); + error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, + -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return (XFS_ERROR(ENOSPC)); @@ -350,27 +198,38 @@ xfs_trans_reserve( /* * Reserve the log space needed for this transaction. */ - if (logspace > 0) { - ASSERT((tp->t_log_res == 0) || (tp->t_log_res == logspace)); - ASSERT((tp->t_log_count == 0) || - (tp->t_log_count == logcount)); - if (flags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_PERM_RESERV; + if (resp->tr_logres > 0) { + bool permanent = false; + + ASSERT(tp->t_log_res == 0 || + tp->t_log_res == resp->tr_logres); + ASSERT(tp->t_log_count == 0 || + tp->t_log_count == resp->tr_logcount); + + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { tp->t_flags |= XFS_TRANS_PERM_LOG_RES; + permanent = true; } else { ASSERT(tp->t_ticket == NULL); ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES)); - log_flags = 0; } - error = xfs_log_reserve(tp->t_mountp, logspace, logcount, - &tp->t_ticket, - XFS_TRANSACTION, log_flags, tp->t_type); - if (error) { - goto undo_blocks; + if (tp->t_ticket != NULL) { + ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES); + error = xfs_log_regrant(tp->t_mountp, tp->t_ticket); + } else { + error = xfs_log_reserve(tp->t_mountp, + resp->tr_logres, + resp->tr_logcount, + &tp->t_ticket, XFS_TRANSACTION, + permanent, tp->t_type); } - tp->t_log_res = logspace; - tp->t_log_count = logcount; + + if (error) + goto undo_blocks; + + tp->t_log_res = resp->tr_logres; + tp->t_log_count = resp->tr_logcount; } /* @@ -380,7 +239,7 @@ xfs_trans_reserve( */ if (rtextents > 0) { error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -rtextents, rsvd); + -((int64_t)rtextents), rsvd); if (error) { error = XFS_ERROR(ENOSPC); goto undo_log; @@ -395,8 +254,10 @@ xfs_trans_reserve( * reservations which have already been performed. */ undo_log: - if (logspace > 0) { - if (flags & XFS_TRANS_PERM_LOG_RES) { + if (resp->tr_logres > 0) { + int log_flags; + + if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { log_flags = XFS_LOG_REL_PERM_RESERV; } else { log_flags = 0; @@ -409,8 +270,8 @@ undo_log: undo_blocks: if (blocks > 0) { - (void) xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FDBLOCKS, - blocks, rsvd); + xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, + (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -419,7 +280,6 @@ undo_blocks: return error; } - /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. @@ -427,20 +287,34 @@ undo_blocks: * * Mark the transaction structure to indicate that the superblock * needs to be updated before committing. + * + * Because we may not be keeping track of allocated/free inodes and + * used filesystem blocks in the superblock, we do not mark the + * superblock dirty in this transaction if we modify these fields. + * We still need to update the transaction deltas so that they get + * applied to the incore superblock, but we don't want them to + * cause the superblock to get locked and logged if these are the + * only fields in the superblock that the transaction modifies. */ void xfs_trans_mod_sb( xfs_trans_t *tp, uint field, - long delta) + int64_t delta) { + uint32_t flags = (XFS_TRANS_DIRTY|XFS_TRANS_SB_DIRTY); + xfs_mount_t *mp = tp->t_mountp; switch (field) { case XFS_TRANS_SB_ICOUNT: tp->t_icount_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_IFREE: tp->t_ifree_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FDBLOCKS: /* @@ -453,6 +327,8 @@ xfs_trans_mod_sb( ASSERT(tp->t_blk_res_used <= tp->t_blk_res); } tp->t_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FDBLOCKS: /* @@ -462,6 +338,8 @@ xfs_trans_mod_sb( */ ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; + if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FREXTENTS: /* @@ -515,7 +393,7 @@ xfs_trans_mod_sb( return; } - tp->t_flags |= (XFS_TRANS_SB_DIRTY | XFS_TRANS_DIRTY); + tp->t_flags |= flags; } /* @@ -530,7 +408,7 @@ STATIC void xfs_trans_apply_sb_deltas( xfs_trans_t *tp) { - xfs_sb_t *sbp; + xfs_dsb_t *sbp; xfs_buf_t *bp; int whole = 0; @@ -544,56 +422,55 @@ xfs_trans_apply_sb_deltas( (tp->t_ag_freeblks_delta + tp->t_ag_flist_delta + tp->t_ag_btree_delta)); - if (tp->t_icount_delta != 0) { - INT_MOD(sbp->sb_icount, ARCH_CONVERT, tp->t_icount_delta); - } - if (tp->t_ifree_delta != 0) { - INT_MOD(sbp->sb_ifree, ARCH_CONVERT, tp->t_ifree_delta); + /* + * Only update the superblock counters if we are logging them + */ + if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (tp->t_icount_delta) + be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); + if (tp->t_ifree_delta) + be64_add_cpu(&sbp->sb_ifree, tp->t_ifree_delta); + if (tp->t_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_fdblocks_delta); + if (tp->t_res_fdblocks_delta) + be64_add_cpu(&sbp->sb_fdblocks, tp->t_res_fdblocks_delta); } - if (tp->t_fdblocks_delta != 0) { - INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_fdblocks_delta); - } - if (tp->t_res_fdblocks_delta != 0) { - INT_MOD(sbp->sb_fdblocks, ARCH_CONVERT, tp->t_res_fdblocks_delta); - } + if (tp->t_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_frextents_delta); + if (tp->t_res_frextents_delta) + be64_add_cpu(&sbp->sb_frextents, tp->t_res_frextents_delta); - if (tp->t_frextents_delta != 0) { - INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_frextents_delta); - } - if (tp->t_res_frextents_delta != 0) { - INT_MOD(sbp->sb_frextents, ARCH_CONVERT, tp->t_res_frextents_delta); - } - if (tp->t_dblocks_delta != 0) { - INT_MOD(sbp->sb_dblocks, ARCH_CONVERT, tp->t_dblocks_delta); + if (tp->t_dblocks_delta) { + be64_add_cpu(&sbp->sb_dblocks, tp->t_dblocks_delta); whole = 1; } - if (tp->t_agcount_delta != 0) { - INT_MOD(sbp->sb_agcount, ARCH_CONVERT, tp->t_agcount_delta); + if (tp->t_agcount_delta) { + be32_add_cpu(&sbp->sb_agcount, tp->t_agcount_delta); whole = 1; } - if (tp->t_imaxpct_delta != 0) { - INT_MOD(sbp->sb_imax_pct, ARCH_CONVERT, tp->t_imaxpct_delta); + if (tp->t_imaxpct_delta) { + sbp->sb_imax_pct += tp->t_imaxpct_delta; whole = 1; } - if (tp->t_rextsize_delta != 0) { - INT_MOD(sbp->sb_rextsize, ARCH_CONVERT, tp->t_rextsize_delta); + if (tp->t_rextsize_delta) { + be32_add_cpu(&sbp->sb_rextsize, tp->t_rextsize_delta); whole = 1; } - if (tp->t_rbmblocks_delta != 0) { - INT_MOD(sbp->sb_rbmblocks, ARCH_CONVERT, tp->t_rbmblocks_delta); + if (tp->t_rbmblocks_delta) { + be32_add_cpu(&sbp->sb_rbmblocks, tp->t_rbmblocks_delta); whole = 1; } - if (tp->t_rblocks_delta != 0) { - INT_MOD(sbp->sb_rblocks, ARCH_CONVERT, tp->t_rblocks_delta); + if (tp->t_rblocks_delta) { + be64_add_cpu(&sbp->sb_rblocks, tp->t_rblocks_delta); whole = 1; } - if (tp->t_rextents_delta != 0) { - INT_MOD(sbp->sb_rextents, ARCH_CONVERT, tp->t_rextents_delta); + if (tp->t_rextents_delta) { + be64_add_cpu(&sbp->sb_rextents, tp->t_rextents_delta); whole = 1; } - if (tp->t_rextslog_delta != 0) { - INT_MOD(sbp->sb_rextslog, ARCH_CONVERT, tp->t_rextslog_delta); + if (tp->t_rextslog_delta) { + sbp->sb_rextslog += tp->t_rextslog_delta; whole = 1; } @@ -601,124 +478,142 @@ xfs_trans_apply_sb_deltas( /* * Log the whole thing, the fields are noncontiguous. */ - xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1); + xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1); else /* * Since all the modifiable fields are contiguous, we * can get away with this. */ - xfs_trans_log_buf(tp, bp, offsetof(xfs_sb_t, sb_icount), - offsetof(xfs_sb_t, sb_frextents) + + xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), + offsetof(xfs_dsb_t, sb_frextents) + sizeof(sbp->sb_frextents) - 1); - - XFS_MTOVFS(tp->t_mountp)->vfs_super->s_dirt = 1; } /* - * xfs_trans_unreserve_and_mod_sb() is called to release unused - * reservations and apply superblock counter changes to the in-core - * superblock. + * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations + * and apply superblock counter changes to the in-core superblock. The + * t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT + * applied to the in-core superblock. The idea is that that has already been + * done. * * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). + * However, we have to ensure that we only modify each superblock field only + * once because the application of the delta values may not be atomic. That can + * lead to ENOSPC races occurring if we have two separate modifcations of the + * free space counter to put back the entire reservation and then take away + * what we used. + * + * If we are not logging superblock counters, then the inode allocated/free and + * used block counts are not updated in the on disk superblock. In this case, + * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we + * still need to update the incore superblock with the changes. */ -STATIC void +void xfs_trans_unreserve_and_mod_sb( xfs_trans_t *tp) { - xfs_mod_sb_t msb[14]; /* If you add cases, add entries */ + xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ xfs_mod_sb_t *msbp; + xfs_mount_t *mp = tp->t_mountp; /* REFERENCED */ int error; int rsvd; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; msbp = msb; rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* - * Release any reserved blocks. Any that were allocated - * will be taken back again by fdblocks_delta below. - */ - if (tp->t_blk_res > 0) { - msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = tp->t_blk_res; - msbp++; + /* calculate deltas */ + if (tp->t_blk_res > 0) + blkdelta = tp->t_blk_res; + if ((tp->t_fdblocks_delta != 0) && + (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY))) + blkdelta += tp->t_fdblocks_delta; + + if (tp->t_rtx_res > 0) + rtxdelta = tp->t_rtx_res; + if ((tp->t_frextents_delta != 0) && + (tp->t_flags & XFS_TRANS_SB_DIRTY)) + rtxdelta += tp->t_frextents_delta; + + if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (tp->t_flags & XFS_TRANS_SB_DIRTY)) { + idelta = tp->t_icount_delta; + ifreedelta = tp->t_ifree_delta; } - /* - * Release any reserved real time extents . Any that were - * allocated will be taken back again by frextents_delta below. - */ - if (tp->t_rtx_res > 0) { + /* apply the per-cpu counters */ + if (blkdelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, + blkdelta, rsvd); + if (error) + goto out; + } + + if (idelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, + idelta, rsvd); + if (error) + goto out_undo_fdblocks; + } + + if (ifreedelta) { + error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, + ifreedelta, rsvd); + if (error) + goto out_undo_icount; + } + + /* apply remaining deltas */ + if (rtxdelta != 0) { msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = tp->t_rtx_res; + msbp->msb_delta = rtxdelta; msbp++; } - /* - * Apply any superblock modifications to the in-core version. - * The t_res_fdblocks_delta and t_res_frextents_delta fields are - * explicitly NOT applied to the in-core superblock. - * The idea is that that has already been done. - */ if (tp->t_flags & XFS_TRANS_SB_DIRTY) { - if (tp->t_icount_delta != 0) { - msbp->msb_field = XFS_SBS_ICOUNT; - msbp->msb_delta = (int)tp->t_icount_delta; - msbp++; - } - if (tp->t_ifree_delta != 0) { - msbp->msb_field = XFS_SBS_IFREE; - msbp->msb_delta = (int)tp->t_ifree_delta; - msbp++; - } - if (tp->t_fdblocks_delta != 0) { - msbp->msb_field = XFS_SBS_FDBLOCKS; - msbp->msb_delta = (int)tp->t_fdblocks_delta; - msbp++; - } - if (tp->t_frextents_delta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = (int)tp->t_frextents_delta; - msbp++; - } if (tp->t_dblocks_delta != 0) { msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = (int)tp->t_dblocks_delta; + msbp->msb_delta = tp->t_dblocks_delta; msbp++; } if (tp->t_agcount_delta != 0) { msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = (int)tp->t_agcount_delta; + msbp->msb_delta = tp->t_agcount_delta; msbp++; } if (tp->t_imaxpct_delta != 0) { msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = (int)tp->t_imaxpct_delta; + msbp->msb_delta = tp->t_imaxpct_delta; msbp++; } if (tp->t_rextsize_delta != 0) { msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = (int)tp->t_rextsize_delta; + msbp->msb_delta = tp->t_rextsize_delta; msbp++; } if (tp->t_rbmblocks_delta != 0) { msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = (int)tp->t_rbmblocks_delta; + msbp->msb_delta = tp->t_rbmblocks_delta; msbp++; } if (tp->t_rblocks_delta != 0) { msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = (int)tp->t_rblocks_delta; + msbp->msb_delta = tp->t_rblocks_delta; msbp++; } if (tp->t_rextents_delta != 0) { msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = (int)tp->t_rextents_delta; + msbp->msb_delta = tp->t_rextents_delta; msbp++; } if (tp->t_rextslog_delta != 0) { msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = (int)tp->t_rextslog_delta; + msbp->msb_delta = tp->t_rextslog_delta; msbp++; } } @@ -729,15 +624,215 @@ xfs_trans_unreserve_and_mod_sb( if (msbp > msb) { error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, (uint)(msbp - msb), rsvd); - ASSERT(error == 0); + if (error) + goto out_undo_ifreecount; + } + + return; + +out_undo_ifreecount: + if (ifreedelta) + xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); +out_undo_icount: + if (idelta) + xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); +out_undo_fdblocks: + if (blkdelta) + xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); +out: + ASSERT(error == 0); + return; +} + +/* + * Add the given log item to the transaction's list of log items. + * + * The log item will now point to its new descriptor with its li_desc field. + */ +void +xfs_trans_add_item( + struct xfs_trans *tp, + struct xfs_log_item *lip) +{ + struct xfs_log_item_desc *lidp; + + ASSERT(lip->li_mountp == tp->t_mountp); + ASSERT(lip->li_ailp == tp->t_mountp->m_ail); + + lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); + + lidp->lid_item = lip; + lidp->lid_flags = 0; + list_add_tail(&lidp->lid_trans, &tp->t_items); + + lip->li_desc = lidp; +} + +STATIC void +xfs_trans_free_item_desc( + struct xfs_log_item_desc *lidp) +{ + list_del_init(&lidp->lid_trans); + kmem_zone_free(xfs_log_item_desc_zone, lidp); +} + +/* + * Unlink and free the given descriptor. + */ +void +xfs_trans_del_item( + struct xfs_log_item *lip) +{ + xfs_trans_free_item_desc(lip->li_desc); + lip->li_desc = NULL; +} + +/* + * Unlock all of the items of a transaction and free all the descriptors + * of that transaction. + */ +void +xfs_trans_free_items( + struct xfs_trans *tp, + xfs_lsn_t commit_lsn, + int flags) +{ + struct xfs_log_item_desc *lidp, *next; + + list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { + struct xfs_log_item *lip = lidp->lid_item; + + lip->li_desc = NULL; + + if (commit_lsn != NULLCOMMITLSN) + lip->li_ops->iop_committing(lip, commit_lsn); + if (flags & XFS_TRANS_ABORT) + lip->li_flags |= XFS_LI_ABORTED; + lip->li_ops->iop_unlock(lip); + + xfs_trans_free_item_desc(lidp); } } +static inline void +xfs_log_item_batch_insert( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t commit_lsn) +{ + int i; + + spin_lock(&ailp->xa_lock); + /* xfs_trans_ail_update_bulk drops ailp->xa_lock */ + xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + + lip->li_ops->iop_unpin(lip, 0); + } +} /* - * xfs_trans_commit + * Bulk operation version of xfs_trans_committed that takes a log vector of + * items to insert into the AIL. This uses bulk AIL insertion techniques to + * minimise lock traffic. + * + * If we are called with the aborted flag set, it is because a log write during + * a CIL checkpoint commit has failed. In this case, all the items in the + * checkpoint have already gone through iop_commited and iop_unlock, which + * means that checkpoint commit abort handling is treated exactly the same + * as an iclog write error even though we haven't started any IO yet. Hence in + * this case all we need to do is iop_committed processing, followed by an + * iop_unpin(aborted) call. * - * Commit the given transaction to the log a/synchronously. + * The AIL cursor is used to optimise the insert process. If commit_lsn is not + * at the end of the AIL, the insert cursor avoids the need to walk + * the AIL to find the insertion point on every xfs_log_item_batch_insert() + * call. This saves a lot of needless list walking and is a net win, even + * though it slightly increases that amount of AIL lock traffic to set it up + * and tear it down. + */ +void +xfs_trans_committed_bulk( + struct xfs_ail *ailp, + struct xfs_log_vec *log_vector, + xfs_lsn_t commit_lsn, + int aborted) +{ +#define LOG_ITEM_BATCH_SIZE 32 + struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; + struct xfs_log_vec *lv; + struct xfs_ail_cursor cur; + int i = 0; + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_last(ailp, &cur, commit_lsn); + spin_unlock(&ailp->xa_lock); + + /* unpin all the log items */ + for (lv = log_vector; lv; lv = lv->lv_next ) { + struct xfs_log_item *lip = lv->lv_item; + xfs_lsn_t item_lsn; + + if (aborted) + lip->li_flags |= XFS_LI_ABORTED; + item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); + + /* item_lsn of -1 means the item needs no further processing */ + if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) + continue; + + /* + * if we are aborting the operation, no point in inserting the + * object into the AIL as we are in a shutdown situation. + */ + if (aborted) { + ASSERT(XFS_FORCED_SHUTDOWN(ailp->xa_mount)); + lip->li_ops->iop_unpin(lip, 1); + continue; + } + + if (item_lsn != commit_lsn) { + + /* + * Not a bulk update option due to unusual item_lsn. + * Push into AIL immediately, rechecking the lsn once + * we have the ail lock. Then unpin the item. This does + * not affect the AIL cursor the bulk insert path is + * using. + */ + spin_lock(&ailp->xa_lock); + if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) + xfs_trans_ail_update(ailp, lip, item_lsn); + else + spin_unlock(&ailp->xa_lock); + lip->li_ops->iop_unpin(lip, 0); + continue; + } + + /* Item is a candidate for bulk AIL insert. */ + log_items[i++] = lv->lv_item; + if (i >= LOG_ITEM_BATCH_SIZE) { + xfs_log_item_batch_insert(ailp, &cur, log_items, + LOG_ITEM_BATCH_SIZE, commit_lsn); + i = 0; + } + } + + /* make sure we insert the remainder! */ + if (i) + xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn); + + spin_lock(&ailp->xa_lock); + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); +} + +/* + * Commit the given transaction to the log. * * XFS disk error handling mechanism is not based on a typical * transaction abort mechanism. Logically after the filesystem @@ -748,28 +843,16 @@ xfs_trans_unreserve_and_mod_sb( * have already been unlocked as if the commit had succeeded. * Do not reference the transaction structure after this call. */ - /*ARGSUSED*/ int -_xfs_trans_commit( - xfs_trans_t *tp, - uint flags, - xfs_lsn_t *commit_lsn_p, - int *log_flushed) +xfs_trans_commit( + struct xfs_trans *tp, + uint flags) { - xfs_log_iovec_t *log_vector; - int nvec; - xfs_mount_t *mp; - xfs_lsn_t commit_lsn; - /* REFERENCED */ - int error; - int log_flags; - int sync; -#define XFS_TRANS_LOGVEC_COUNT 16 - xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; - void *commit_iclog; - int shutdown; - - commit_lsn = -1; + struct xfs_mount *mp = tp->t_mountp; + xfs_lsn_t commit_lsn = -1; + int error = 0; + int log_flags = 0; + int sync = tp->t_flags & XFS_TRANS_SYNC; /* * Determine whether this commit is releasing a permanent @@ -778,10 +861,7 @@ _xfs_trans_commit( if (flags & XFS_TRANS_RELEASE_LOG_RES) { ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; } - mp = tp->t_mountp; /* * If there is nothing to be logged by the transaction, @@ -790,317 +870,63 @@ _xfs_trans_commit( * Also make sure to return any reserved blocks to * the free pool. */ -shut_us_down: - shutdown = XFS_FORCED_SHUTDOWN(mp) ? EIO : 0; - if (!(tp->t_flags & XFS_TRANS_DIRTY) || shutdown) { - xfs_trans_unreserve_and_mod_sb(tp); - /* - * It is indeed possible for the transaction to be - * not dirty but the dqinfo portion to be. All that - * means is that we have some (non-persistent) quota - * reservations that need to be unreserved. - */ - XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp); - if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, - NULL, log_flags); - if (commit_lsn == -1 && !shutdown) - shutdown = XFS_ERROR(EIO); - } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, shutdown? XFS_TRANS_ABORT : 0); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); - if (commit_lsn_p) - *commit_lsn_p = commit_lsn; - return (shutdown); + if (!(tp->t_flags & XFS_TRANS_DIRTY)) + goto out_unreserve; + + if (XFS_FORCED_SHUTDOWN(mp)) { + error = XFS_ERROR(EIO); + goto out_unreserve; } + ASSERT(tp->t_ticket != NULL); /* * If we need to update the superblock, then do it now. */ - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { + if (tp->t_flags & XFS_TRANS_SB_DIRTY) xfs_trans_apply_sb_deltas(tp); - } - XFS_TRANS_APPLY_DQUOT_DELTAS(mp, tp); - - /* - * Ask each log item how many log_vector entries it will - * need so we can figure out how many to allocate. - * Try to avoid the kmem_alloc() call in the common case - * by using a vector from the stack when it fits. - */ - nvec = xfs_trans_count_vecs(tp); - if (nvec == 0) { - xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); - goto shut_us_down; - } else if (nvec <= XFS_TRANS_LOGVEC_COUNT) { - log_vector = log_vector_fast; - } else { - log_vector = (xfs_log_iovec_t *)kmem_alloc(nvec * - sizeof(xfs_log_iovec_t), - KM_SLEEP); - } - - /* - * Fill in the log_vector and pin the logged items, and - * then write the transaction to the log. - */ - xfs_trans_fill_vecs(tp, log_vector); - - error = xfs_log_write(mp, log_vector, nvec, tp->t_ticket, &(tp->t_lsn)); - - /* - * The transaction is committed incore here, and can go out to disk - * at any time after this call. However, all the items associated - * with the transaction are still locked and pinned in memory. - */ - commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); - - tp->t_commit_lsn = commit_lsn; - if (nvec > XFS_TRANS_LOGVEC_COUNT) { - kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t)); - } - - if (commit_lsn_p) - *commit_lsn_p = commit_lsn; - - /* - * If we got a log write error. Unpin the logitems that we - * had pinned, clean up, free trans structure, and return error. - */ - if (error || commit_lsn == -1) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_uncommit(tp, flags|XFS_TRANS_ABORT); - return XFS_ERROR(EIO); - } - - /* - * Once the transaction has committed, unused - * reservations need to be released and changes to - * the superblock need to be reflected in the in-core - * version. Do that now. - */ - xfs_trans_unreserve_and_mod_sb(tp); - - sync = tp->t_flags & XFS_TRANS_SYNC; - - /* - * Tell the LM to call the transaction completion routine - * when the log write with LSN commit_lsn completes (e.g. - * when the transaction commit really hits the on-disk log). - * After this call we cannot reference tp, because the call - * can happen at any time and the call will free the transaction - * structure pointed to by tp. The only case where we call - * the completion routine (xfs_trans_committed) directly is - * if the log is turned off on a debug kernel or we're - * running in simulation mode (the log is explicitly turned - * off). - */ - tp->t_logcb.cb_func = (void(*)(void*, int))xfs_trans_committed; - tp->t_logcb.cb_arg = tp; + xfs_trans_apply_dquot_deltas(tp); - /* - * We need to pass the iclog buffer which was used for the - * transaction commit record into this function, and attach - * the callback to it. The callback must be attached before - * the items are unlocked to avoid racing with other threads - * waiting for an item to unlock. - */ - shutdown = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb)); + xfs_log_commit_cil(mp, tp, &commit_lsn, flags); - /* - * Mark this thread as no longer being in a transaction - */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* - * Once all the items of the transaction have been copied - * to the in core log and the callback is attached, the - * items can be unlocked. - * - * This will free descriptors pointing to items which were - * not logged since there is nothing more to do with them. - * For items which were logged, we will keep pointers to them - * so they can be unpinned after the transaction commits to disk. - * This will also stamp each modified meta-data item with - * the commit lsn of this transaction for dependency tracking - * purposes. - */ - xfs_trans_unlock_items(tp, commit_lsn); - - /* - * If we detected a log error earlier, finish committing - * the transaction now (unpin log items, etc). - * - * Order is critical here, to avoid using the transaction - * pointer after its been freed (by xfs_trans_committed - * either here now, or as a callback). We cannot do this - * step inside xfs_log_notify as was done earlier because - * of this issue. - */ - if (shutdown) - xfs_trans_committed(tp, XFS_LI_ABORTED); - - /* - * Now that the xfs_trans_committed callback has been attached, - * and the items are released we can finally allow the iclog to - * go to disk. - */ - error = xfs_log_release_iclog(mp, commit_iclog); + xfs_trans_free(tp); /* * If the transaction needs to be synchronous, then force the * log out now and wait for it. */ if (sync) { - if (!error) { - error = _xfs_log_force(mp, commit_lsn, - XFS_LOG_FORCE | XFS_LOG_SYNC, - log_flushed); - } + error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); XFS_STATS_INC(xs_trans_sync); } else { XFS_STATS_INC(xs_trans_async); } - return (error); -} - - -/* - * Total up the number of log iovecs needed to commit this - * transaction. The transaction itself needs one for the - * transaction header. Ask each dirty item in turn how many - * it needs to get the total. - */ -STATIC uint -xfs_trans_count_vecs( - xfs_trans_t *tp) -{ - int nvecs; - xfs_log_item_desc_t *lidp; - - nvecs = 1; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - - /* In the non-debug case we need to start bailing out if we - * didn't find a log_item here, return zero and let trans_commit - * deal with it. - */ - if (lidp == NULL) - return 0; - - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - lidp->lid_size = IOP_SIZE(lidp->lid_item); - nvecs += lidp->lid_size; - lidp = xfs_trans_next_item(tp, lidp); - } - - return nvecs; -} - -/* - * Called from the trans_commit code when we notice that - * the filesystem is in the middle of a forced shutdown. - */ -STATIC void -xfs_trans_uncommit( - xfs_trans_t *tp, - uint flags) -{ - xfs_log_item_desc_t *lidp; - - for (lidp = xfs_trans_first_item(tp); - lidp != NULL; - lidp = xfs_trans_next_item(tp, lidp)) { - /* - * Unpin all but those that aren't dirty. - */ - if (lidp->lid_flags & XFS_LID_DIRTY) - IOP_UNPIN_REMOVE(lidp->lid_item, tp); - } + return error; +out_unreserve: xfs_trans_unreserve_and_mod_sb(tp); - XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(tp->t_mountp, tp); - - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); - xfs_trans_free(tp); -} - -/* - * Fill in the vector with pointers to data to be logged - * by this transaction. The transaction header takes - * the first vector, and then each dirty item takes the - * number of vectors it indicated it needed in xfs_trans_count_vecs(). - * - * As each item fills in the entries it needs, also pin the item - * so that it cannot be flushed out until the log write completes. - */ -STATIC void -xfs_trans_fill_vecs( - xfs_trans_t *tp, - xfs_log_iovec_t *log_vector) -{ - xfs_log_item_desc_t *lidp; - xfs_log_iovec_t *vecp; - uint nitems; /* - * Skip over the entry for the transaction header, we'll - * fill that in at the end. + * It is indeed possible for the transaction to be not dirty but + * the dqinfo portion to be. All that means is that we have some + * (non-persistent) quota reservations that need to be unreserved. */ - vecp = log_vector + 1; /* pointer arithmetic */ - - nitems = 0; - lidp = xfs_trans_first_item(tp); - ASSERT(lidp != NULL); - while (lidp != NULL) { - /* - * Skip items which aren't dirty in this transaction. - */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) { - lidp = xfs_trans_next_item(tp, lidp); - continue; - } - /* - * The item may be marked dirty but not log anything. - * This can be used to get called when a transaction - * is committed. - */ - if (lidp->lid_size) { - nitems++; - } - IOP_FORMAT(lidp->lid_item, vecp); - vecp += lidp->lid_size; /* pointer arithmetic */ - IOP_PIN(lidp->lid_item); - lidp = xfs_trans_next_item(tp, lidp); + xfs_trans_unreserve_and_mod_dquots(tp); + if (tp->t_ticket) { + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + if (commit_lsn == -1 && !error) + error = XFS_ERROR(EIO); } + current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free(tp); - /* - * Now that we've counted the number of items in this - * transaction, fill in the transaction header. - */ - tp->t_header.th_magic = XFS_TRANS_HEADER_MAGIC; - tp->t_header.th_type = tp->t_type; - tp->t_header.th_num_items = nitems; - log_vector->i_addr = (xfs_caddr_t)&tp->t_header; - log_vector->i_len = sizeof(xfs_trans_header_t); - XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR); + XFS_STATS_INC(xs_trans_empty); + return error; } - /* * Unlock all of the transaction's items and free the transaction. * The transaction must not have modified any of its items, because @@ -1115,12 +941,6 @@ xfs_trans_cancel( int flags) { int log_flags; -#ifdef DEBUG - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - int i; -#endif xfs_mount_t *mp = tp->t_mountp; /* @@ -1139,25 +959,15 @@ xfs_trans_cancel( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT)) { - licp = &(tp->t_items); - while (licp != NULL) { - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - - lip = lidp->lid_item; - if (!XFS_FORCED_SHUTDOWN(mp)) - ASSERT(!(lip->li_type == XFS_LI_EFD)); - } - licp = licp->lic_next; - } + if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + struct xfs_log_item_desc *lidp; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) + ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); } #endif xfs_trans_unreserve_and_mod_sb(tp); - XFS_TRANS_UNRESERVE_AND_MOD_DQUOTS(mp, tp); + xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { if (flags & XFS_TRANS_RELEASE_LOG_RES) { @@ -1172,193 +982,75 @@ xfs_trans_cancel( /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, flags); - xfs_trans_free_busy(tp); + xfs_trans_free_items(tp, NULLCOMMITLSN, flags); xfs_trans_free(tp); } - -/* - * Free the transaction structure. If there is more clean up - * to do when the structure is freed, add it here. - */ -STATIC void -xfs_trans_free( - xfs_trans_t *tp) -{ - atomic_dec(&tp->t_mountp->m_active_trans); - XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp); - kmem_zone_free(xfs_trans_zone, tp); -} - - /* - * THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item(). - * - * This is typically called by the LM when a transaction has been fully - * committed to disk. It needs to unpin the items which have - * been logged by the transaction and update their positions - * in the AIL if necessary. - * This also gets called when the transactions didn't get written out - * because of an I/O error. Abortflag & XFS_LI_ABORTED is set then. - * - * Call xfs_trans_chunk_committed() to process the items in - * each chunk. + * Roll from one trans in the sequence of PERMANENT transactions to + * the next: permanent transactions are only flushed out when + * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * as possible to let chunks of it go to the log. So we commit the + * chunk we've been working on and get a new transaction to continue. */ -STATIC void -xfs_trans_committed( - xfs_trans_t *tp, - int abortflag) +int +xfs_trans_roll( + struct xfs_trans **tpp, + struct xfs_inode *dp) { - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i; + struct xfs_trans *trans; + struct xfs_trans_res tres; + int error; /* - * Call the transaction's completion callback if there - * is one. + * Ensure that the inode is always logged. */ - if (tp->t_callback != NULL) { - tp->t_callback(tp, tp->t_callarg); - } + trans = *tpp; + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); /* - * Special case the chunk embedded in the transaction. + * Copy the critical parameters from one trans to the next. */ - licp = &(tp->t_items); - if (!(XFS_LIC_ARE_ALL_FREE(licp))) { - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - } + tres.tr_logres = trans->t_log_res; + tres.tr_logcount = trans->t_log_count; + *tpp = xfs_trans_dup(trans); /* - * Process the items in each chunk in turn. + * Commit the current transaction. + * If this commit failed, then it'd just unlock those items that + * are not marked ihold. That also means that a filesystem shutdown + * is in progress. The caller takes the responsibility to cancel + * the duplicate transaction that gets returned. */ - licp = licp->lic_next; - while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); - xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag); - next_licp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); - licp = next_licp; - } + error = xfs_trans_commit(trans, 0); + if (error) + return (error); - /* - * Clear all the per-AG busy list items listed in this transaction - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - for (i = 0, lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) { - if (!XFS_LBC_ISFREE(lbcp, i)) { - xfs_alloc_clear_busy(tp, lbsp->lbc_ag, - lbsp->lbc_idx); - } - } - lbcp = lbcp->lbc_next; - } - xfs_trans_free_busy(tp); + trans = *tpp; /* - * That's it for the transaction structure. Free it. + * transaction commit worked ok so we can drop the extra ticket + * reference that we gained in xfs_trans_dup() */ - xfs_trans_free(tp); -} - -/* - * This is called to perform the commit processing for each - * item described by the given chunk. - * - * The commit processing consists of unlocking items which were - * held locked with the SYNC_UNLOCK attribute, calling the committed - * routine of each logged item, updating the item's position in the AIL - * if necessary, and unpinning each item. If the committed routine - * returns -1, then do nothing further with the item because it - * may have been freed. - * - * Since items are unlocked when they are copied to the incore - * log, it is possible for two transactions to be completing - * and manipulating the same item simultaneously. The AIL lock - * will protect the lsn field of each item. The value of this - * field can never go backwards. - * - * We unpin the items after repositioning them in the AIL, because - * otherwise they could be immediately flushed and we'd have to race - * with the flusher trying to pull the item from the AIL as we add it. - */ -STATIC void -xfs_trans_chunk_committed( - xfs_log_item_chunk_t *licp, - xfs_lsn_t lsn, - int aborted) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - xfs_lsn_t item_lsn; - struct xfs_mount *mp; - int i; - SPLDECL(s); - - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } + xfs_log_ticket_put(trans->t_ticket); - lip = lidp->lid_item; - if (aborted) - lip->li_flags |= XFS_LI_ABORTED; - - /* - * Send in the ABORTED flag to the COMMITTED routine - * so that it knows whether the transaction was aborted - * or not. - */ - item_lsn = IOP_COMMITTED(lip, lsn); - - /* - * If the committed routine returns -1, make - * no more references to the item. - */ - if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) { - continue; - } - /* - * If the returned lsn is greater than what it - * contained before, update the location of the - * item in the AIL. If it is not, then do nothing. - * Items can never move backwards in the AIL. - * - * While the new lsn should usually be greater, it - * is possible that a later transaction completing - * simultaneously with an earlier one using the - * same item could complete first with a higher lsn. - * This would cause the earlier transaction to fail - * the test below. - */ - mp = lip->li_mountp; - AIL_LOCK(mp,s); - if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) { - /* - * This will set the item's lsn to item_lsn - * and update the position of the item in - * the AIL. - * - * xfs_trans_update_ail() drops the AIL lock. - */ - xfs_trans_update_ail(mp, lip, item_lsn, s); - } else { - AIL_UNLOCK(mp, s); - } + /* + * Reserve space in the log for th next transaction. + * This also pushes items in the "AIL", the list of logged items, + * out to disk if they are taking up space at the tail of the log + * that we want to use. This requires that either nothing be locked + * across this call, or that anything that is locked be logged in + * the prior and the next transactions. + */ + tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; + error = xfs_trans_reserve(trans, &tres, 0, 0); + /* + * Ensure that the inode is in the new transaction and locked. + */ + if (error) + return error; - /* - * Now that we've repositioned the item in the AIL, - * unpin it so it can be flushed. Pass information - * about buffer stale state down from the log item - * flags, if anyone else stales the buffer we do not - * want to pay any attention to it. - */ - IOP_UNPIN(lip, lidp->lid_flags & XFS_LID_BUF_STALE); - } + xfs_trans_ijoin(trans, dp, 0); + return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9dc88b38060..b5bc1ab3c4d 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -18,91 +18,8 @@ #ifndef __XFS_TRANS_H__ #define __XFS_TRANS_H__ -/* - * This is the structure written in the log at the head of - * every transaction. It identifies the type and id of the - * transaction, and contains the number of items logged by - * the transaction so we know how many to expect during recovery. - * - * Do not change the below structure without redoing the code in - * xlog_recover_add_to_trans() and xlog_recover_add_to_cont_trans(). - */ -typedef struct xfs_trans_header { - uint th_magic; /* magic number */ - uint th_type; /* transaction type */ - __int32_t th_tid; /* transaction id (unused) */ - uint th_num_items; /* num items logged by trans */ -} xfs_trans_header_t; - -#define XFS_TRANS_HEADER_MAGIC 0x5452414e /* TRAN */ - -/* - * Log item types. - */ -#define XFS_LI_5_3_BUF 0x1234 /* v1 bufs, 1-block inode buffers */ -#define XFS_LI_5_3_INODE 0x1235 /* 1-block inode buffers */ -#define XFS_LI_EFI 0x1236 -#define XFS_LI_EFD 0x1237 -#define XFS_LI_IUNLINK 0x1238 -#define XFS_LI_6_1_INODE 0x1239 /* 4K non-aligned inode bufs */ -#define XFS_LI_6_1_BUF 0x123a /* v1, 4K inode buffers */ -#define XFS_LI_INODE 0x123b /* aligned ino chunks, var-size ibufs */ -#define XFS_LI_BUF 0x123c /* v2 bufs, variable sized inode bufs */ -#define XFS_LI_DQUOT 0x123d -#define XFS_LI_QUOTAOFF 0x123e - -/* - * Transaction types. Used to distinguish types of buffers. - */ -#define XFS_TRANS_SETATTR_NOT_SIZE 1 -#define XFS_TRANS_SETATTR_SIZE 2 -#define XFS_TRANS_INACTIVE 3 -#define XFS_TRANS_CREATE 4 -#define XFS_TRANS_CREATE_TRUNC 5 -#define XFS_TRANS_TRUNCATE_FILE 6 -#define XFS_TRANS_REMOVE 7 -#define XFS_TRANS_LINK 8 -#define XFS_TRANS_RENAME 9 -#define XFS_TRANS_MKDIR 10 -#define XFS_TRANS_RMDIR 11 -#define XFS_TRANS_SYMLINK 12 -#define XFS_TRANS_SET_DMATTRS 13 -#define XFS_TRANS_GROWFS 14 -#define XFS_TRANS_STRAT_WRITE 15 -#define XFS_TRANS_DIOSTRAT 16 -#define XFS_TRANS_WRITE_SYNC 17 -#define XFS_TRANS_WRITEID 18 -#define XFS_TRANS_ADDAFORK 19 -#define XFS_TRANS_ATTRINVAL 20 -#define XFS_TRANS_ATRUNCATE 21 -#define XFS_TRANS_ATTR_SET 22 -#define XFS_TRANS_ATTR_RM 23 -#define XFS_TRANS_ATTR_FLAG 24 -#define XFS_TRANS_CLEAR_AGI_BUCKET 25 -#define XFS_TRANS_QM_SBCHANGE 26 -/* - * Dummy entries since we use the transaction type to index into the - * trans_type[] in xlog_recover_print_trans_head() - */ -#define XFS_TRANS_DUMMY1 27 -#define XFS_TRANS_DUMMY2 28 -#define XFS_TRANS_QM_QUOTAOFF 29 -#define XFS_TRANS_QM_DQALLOC 30 -#define XFS_TRANS_QM_SETQLIM 31 -#define XFS_TRANS_QM_DQCLUSTER 32 -#define XFS_TRANS_QM_QINOCREATE 33 -#define XFS_TRANS_QM_QUOTAOFF_END 34 -#define XFS_TRANS_SB_UNIT 35 -#define XFS_TRANS_FSYNC_TS 36 -#define XFS_TRANS_GROWFSRT_ALLOC 37 -#define XFS_TRANS_GROWFSRT_ZERO 38 -#define XFS_TRANS_GROWFSRT_FREE 39 -#define XFS_TRANS_SWAPEXT 40 -#define XFS_TRANS_TYPE_MAX 40 -/* new transaction types need to be reflected in xfs_logprint(8) */ - +/* kernel only transaction subsystem defines */ -#ifdef __KERNEL__ struct xfs_buf; struct xfs_buftarg; struct xfs_efd_log_item; @@ -110,22 +27,19 @@ struct xfs_efi_log_item; struct xfs_inode; struct xfs_item_ops; struct xfs_log_iovec; -struct xfs_log_item; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_trans_res; struct xfs_dquot_acct; - -typedef struct xfs_ail_entry { - struct xfs_log_item *ail_forw; /* AIL forw pointer */ - struct xfs_log_item *ail_back; /* AIL back pointer */ -} xfs_ail_entry_t; +struct xfs_busy_extent; typedef struct xfs_log_item { - xfs_ail_entry_t li_ail; /* AIL pointers */ + struct list_head li_ail; /* AIL pointers */ xfs_lsn_t li_lsn; /* last on-disk lsn */ struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ struct xfs_mount *li_mountp; /* ptr to fs mount */ + struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ struct xfs_log_item *li_bio_list; /* buffer item list */ @@ -133,211 +47,49 @@ typedef struct xfs_log_item { struct xfs_log_item *); /* buffer item iodone */ /* callback func */ - struct xfs_item_ops *li_ops; /* function list */ + const struct xfs_item_ops *li_ops; /* function list */ + + /* delayed logging */ + struct list_head li_cil; /* CIL pointers */ + struct xfs_log_vec *li_lv; /* active log vector */ + xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 #define XFS_LI_ABORTED 0x2 -typedef struct xfs_item_ops { - uint (*iop_size)(xfs_log_item_t *); - void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); +#define XFS_LI_FLAGS \ + { XFS_LI_IN_AIL, "IN_AIL" }, \ + { XFS_LI_ABORTED, "ABORTED" } + +struct xfs_item_ops { + void (*iop_size)(xfs_log_item_t *, int *, int *); + void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *); void (*iop_pin)(xfs_log_item_t *); - void (*iop_unpin)(xfs_log_item_t *, int); - void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *); - uint (*iop_trylock)(xfs_log_item_t *); + void (*iop_unpin)(xfs_log_item_t *, int remove); + uint (*iop_push)(struct xfs_log_item *, struct list_head *); void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); - void (*iop_push)(xfs_log_item_t *); - void (*iop_abort)(xfs_log_item_t *); - void (*iop_pushbuf)(xfs_log_item_t *); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); -} xfs_item_ops_t; - -#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip) -#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) -#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) -#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags) -#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp) -#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) -#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) -#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) -#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip) -#define IOP_ABORT(ip) (*(ip)->li_ops->iop_abort)(ip) -#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip) -#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) - -/* - * Return values for the IOP_TRYLOCK() routines. - */ -#define XFS_ITEM_SUCCESS 0 -#define XFS_ITEM_PINNED 1 -#define XFS_ITEM_LOCKED 2 -#define XFS_ITEM_FLUSHING 3 -#define XFS_ITEM_PUSHBUF 4 - -#endif /* __KERNEL__ */ - -/* - * This structure is used to track log items associated with - * a transaction. It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -typedef struct xfs_log_item_desc { - xfs_log_item_t *lid_item; - ushort lid_size; - unsigned char lid_flags; - unsigned char lid_index; -} xfs_log_item_desc_t; - -#define XFS_LID_DIRTY 0x1 -#define XFS_LID_PINNED 0x2 -#define XFS_LID_BUF_STALE 0x8 - -/* - * This structure is used to maintain a chunk list of log_item_desc - * structures. The free field is a bitmask indicating which descriptors - * in this chunk's array are free. The unused field is the first value - * not used since this chunk was allocated. - */ -#define XFS_LIC_NUM_SLOTS 15 -typedef struct xfs_log_item_chunk { - struct xfs_log_item_chunk *lic_next; - ushort lic_free; - ushort lic_unused; - xfs_log_item_desc_t lic_descs[XFS_LIC_NUM_SLOTS]; -} xfs_log_item_chunk_t; - -#define XFS_LIC_MAX_SLOT (XFS_LIC_NUM_SLOTS - 1) -#define XFS_LIC_FREEMASK ((1 << XFS_LIC_NUM_SLOTS) - 1) - - -/* - * Initialize the given chunk. Set the chunk's free descriptor mask - * to indicate that all descriptors are free. The caller gets to set - * lic_unused to the right value (0 matches all free). The - * lic_descs.lid_index values are set up as each desc is allocated. - */ -#define XFS_LIC_INIT(cp) xfs_lic_init(cp) -static inline void xfs_lic_init(xfs_log_item_chunk_t *cp) -{ - cp->lic_free = XFS_LIC_FREEMASK; -} - -#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot) -static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_descs[slot].lid_index = (unsigned char)(slot); -} - -#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp) -static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp) -{ - return cp->lic_free & XFS_LIC_FREEMASK; -} - -#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp) -static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp) -{ - cp->lic_free = XFS_LIC_FREEMASK; -} - -#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp) -static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp) -{ - return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK); -} - -#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot) -static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot) -{ - return (cp->lic_free & (1 << slot)); -} +}; -#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot) -static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_free &= ~(1 << slot); -} - -#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot) -static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot) -{ - cp->lic_free |= 1 << slot; -} - -#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot) -static inline xfs_log_item_desc_t * -xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot) -{ - return &(cp->lic_descs[slot]); -} - -#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp) -static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp) -{ - return (uint)dp->lid_index; -} +void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, + int type, const struct xfs_item_ops *ops); /* - * Calculate the address of a chunk given a descriptor pointer: - * dp - dp->lid_index give the address of the start of the lic_descs array. - * From this we subtract the offset of the lic_descs field in a chunk. - * All of this yields the address of the chunk, which is - * cast to a chunk pointer. + * Return values for the iop_push() routines. */ -#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp) -static inline xfs_log_item_chunk_t * -xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp) -{ - return (xfs_log_item_chunk_t*) \ - (((xfs_caddr_t)((dp) - (dp)->lid_index)) - \ - (xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs)); -} - -#ifdef __KERNEL__ -/* - * This structure is used to maintain a list of block ranges that have been - * freed in the transaction. The ranges are listed in the perag[] busy list - * between when they're freed and the transaction is committed to disk. - */ - -typedef struct xfs_log_busy_slot { - xfs_agnumber_t lbc_ag; - ushort lbc_idx; /* index in perag.busy[] */ -} xfs_log_busy_slot_t; - -#define XFS_LBC_NUM_SLOTS 31 -typedef struct xfs_log_busy_chunk { - struct xfs_log_busy_chunk *lbc_next; - uint lbc_free; /* free slots bitmask */ - ushort lbc_unused; /* first unused */ - xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS]; -} xfs_log_busy_chunk_t; +#define XFS_ITEM_SUCCESS 0 +#define XFS_ITEM_PINNED 1 +#define XFS_ITEM_LOCKED 2 +#define XFS_ITEM_FLUSHING 3 -#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1) -#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1) - -#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK) -#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot))) -#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)])) -#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK) -#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot))) - -/* - * This is the type of function which can be given to xfs_trans_callback() - * to be called upon the transaction's commit to disk. - */ -typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); /* * This is the structure maintained for every active transaction. */ typedef struct xfs_trans { unsigned int t_magic; /* magic number */ - xfs_log_callback_t t_logcb; /* log callback struct */ unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ @@ -345,589 +97,38 @@ typedef struct xfs_trans { unsigned int t_blk_res_used; /* # of resvd blocks used */ unsigned int t_rtx_res; /* # of rt extents resvd */ unsigned int t_rtx_res_used; /* # of resvd rt extents used */ - xfs_log_ticket_t t_ticket; /* log mgr ticket */ - sema_t t_sema; /* sema for commit completion */ + struct xlog_ticket *t_ticket; /* log mgr ticket */ xfs_lsn_t t_lsn; /* log seq num of start of * transaction. */ xfs_lsn_t t_commit_lsn; /* log seq num of end of * transaction. */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ - xfs_trans_callback_t t_callback; /* transaction callback */ - void *t_callarg; /* callback arg */ unsigned int t_flags; /* misc flags */ - long t_icount_delta; /* superblock icount change */ - long t_ifree_delta; /* superblock ifree change */ - long t_fdblocks_delta; /* superblock fdblocks chg */ - long t_res_fdblocks_delta; /* on-disk only chg */ - long t_frextents_delta;/* superblock freextents chg*/ - long t_res_frextents_delta; /* on-disk only chg */ -#ifdef DEBUG - long t_ag_freeblks_delta; /* debugging counter */ - long t_ag_flist_delta; /* debugging counter */ - long t_ag_btree_delta; /* debugging counter */ + int64_t t_icount_delta; /* superblock icount change */ + int64_t t_ifree_delta; /* superblock ifree change */ + int64_t t_fdblocks_delta; /* superblock fdblocks chg */ + int64_t t_res_fdblocks_delta; /* on-disk only chg */ + int64_t t_frextents_delta;/* superblock freextents chg*/ + int64_t t_res_frextents_delta; /* on-disk only chg */ +#if defined(DEBUG) || defined(XFS_WARN) + int64_t t_ag_freeblks_delta; /* debugging counter */ + int64_t t_ag_flist_delta; /* debugging counter */ + int64_t t_ag_btree_delta; /* debugging counter */ #endif - long t_dblocks_delta;/* superblock dblocks change */ - long t_agcount_delta;/* superblock agcount change */ - long t_imaxpct_delta;/* superblock imaxpct change */ - long t_rextsize_delta;/* superblock rextsize chg */ - long t_rbmblocks_delta;/* superblock rbmblocks chg */ - long t_rblocks_delta;/* superblock rblocks change */ - long t_rextents_delta;/* superblocks rextents chg */ - long t_rextslog_delta;/* superblocks rextslog chg */ - unsigned int t_items_free; /* log item descs free */ - xfs_log_item_chunk_t t_items; /* first log item desc chunk */ - xfs_trans_header_t t_header; /* header for in-log trans */ - unsigned int t_busy_free; /* busy descs free */ - xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */ + int64_t t_dblocks_delta;/* superblock dblocks change */ + int64_t t_agcount_delta;/* superblock agcount change */ + int64_t t_imaxpct_delta;/* superblock imaxpct change */ + int64_t t_rextsize_delta;/* superblock rextsize chg */ + int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */ + int64_t t_rblocks_delta;/* superblock rblocks change */ + int64_t t_rextents_delta;/* superblocks rextents chg */ + int64_t t_rextslog_delta;/* superblocks rextslog chg */ + struct list_head t_items; /* log item descriptors */ + struct list_head t_busy; /* list of busy extents */ unsigned long t_pflags; /* saved process flags state */ } xfs_trans_t; -#endif /* __KERNEL__ */ - - -#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */ -/* - * Values for t_flags. - */ -#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */ -#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */ -#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */ -#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */ -#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */ -#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */ - -/* - * Values for call flags parameter. - */ -#define XFS_TRANS_NOSLEEP 0x1 -#define XFS_TRANS_WAIT 0x2 -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* - * Field values for xfs_trans_mod_sb. - */ -#define XFS_TRANS_SB_ICOUNT 0x00000001 -#define XFS_TRANS_SB_IFREE 0x00000002 -#define XFS_TRANS_SB_FDBLOCKS 0x00000004 -#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008 -#define XFS_TRANS_SB_FREXTENTS 0x00000010 -#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020 -#define XFS_TRANS_SB_DBLOCKS 0x00000040 -#define XFS_TRANS_SB_AGCOUNT 0x00000080 -#define XFS_TRANS_SB_IMAXPCT 0x00000100 -#define XFS_TRANS_SB_REXTSIZE 0x00000200 -#define XFS_TRANS_SB_RBMBLOCKS 0x00000400 -#define XFS_TRANS_SB_RBLOCKS 0x00000800 -#define XFS_TRANS_SB_REXTENTS 0x00001000 -#define XFS_TRANS_SB_REXTSLOG 0x00002000 - - -/* - * Various log reservation values. - * These are based on the size of the file system block - * because that is what most transactions manipulate. - * Each adds in an additional 128 bytes per item logged to - * try to account for the overhead of the transaction mechanism. - * - * Note: - * Most of the reservations underestimate the number of allocation - * groups into which they could free extents in the xfs_bmap_finish() - * call. This is because the number in the worst case is quite high - * and quite unusual. In order to fix this we need to change - * xfs_bmap_finish() to free extents in only a single AG at a time. - * This will require changes to the EFI code as well, however, so that - * the EFI for the extents not freed is logged again in each transaction. - * See bug 261917. - */ - -/* - * Per-extent log reservation for the allocation btree changes - * involved in freeing or allocating an extent. - * 2 trees * (2 blocks/level * max depth - 1) * block size - */ -#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) -#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) - -/* - * Per-directory log reservation for any directory change. - * dir blocks: (1 btree block per level + data block + free block) * dblock size - * bmap btree: (levels + 2) * max depth * block size - * v2 directory blocks can be fragmented below the dirblksize down to the fsb - * size, so account for that in the DAENTER macros. - */ -#define XFS_DIROP_LOG_RES(mp) \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ - (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) -#define XFS_DIROP_LOG_COUNT(mp) \ - (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ - XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) - -/* - * In a write transaction we can allocate a maximum of 2 - * extents. This gives: - * the inode getting the new extents: inode size - * the inode\'s bmap btree: max depth * block size - * the agfs of the ags from which the extents are allocated: 2 * sector - * the superblock free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - * And the bmap_finish transaction can free bmap blocks in a join: - * the agfs of the ags containing the blocks: 2 * sector size - * the agfls of the ags containing the blocks: 2 * sector size - * the super block free block counter: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_WRITE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - -#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) - -/* - * In truncating a file we free up to two extents at once. We can modify: - * the inode being truncated: inode size - * the inode\'s bmap btree: (max depth + 1) * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \ - (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ - ((4 * (mp)->m_sb.sb_sectsize) + \ - (4 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 4) + \ - (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \ - (128 * 5) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - -#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) - -/* - * In renaming a files we can modify: - * the four inodes involved: 4 * inode size - * the two directory btrees: 2 * (max depth + v2) * dir block size - * the two directory bmap btrees: 2 * max depth * block size - * And the bmap_finish transaction can free dir and bmap blocks (two sets - * of bmap blocks) giving: - * the agf for the ags in which the blocks live: 3 * sector size - * the agfl for the ags in which the blocks live: 3 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_RENAME_LOG_RES(mp) \ - (MAX( \ - ((4 * (mp)->m_sb.sb_inodesize) + \ - (2 * XFS_DIROP_LOG_RES(mp)) + \ - (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \ - ((3 * (mp)->m_sb.sb_sectsize) + \ - (3 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 3) + \ - (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))))) - -#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) - -/* - * For creating a link to an inode: - * the parent directory inode: inode size - * the linked inode: inode size - * the directory btree could split: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free some bmap blocks giving: - * the agf for the ag in which the blocks live: sector size - * the agfl for the ag in which the blocks live: sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_LINK_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ - ((mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - -#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) - -/* - * For removing a directory entry we can modify: - * the parent directory inode: inode size - * the removed inode: inode size - * the directory btree could join: (max depth + v2) * dir block size - * the directory bmap btree could join or split: (max depth + v2) * blocksize - * And the bmap_finish transaction can free the dir and bmap blocks giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_REMOVE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - -#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) - -/* - * For symlink we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: 1 block - * the directory btree: (max depth + v2) * dir block size - * the directory inode\'s bmap btree: (max depth + v2) * block size - * the blocks for the symlink: 1 KB - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_SYMLINK_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B(mp, 1) + \ - XFS_DIROP_LOG_RES(mp) + \ - 1024 + \ - (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \ - (2 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - -#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) - -/* - * For create we can modify: - * the parent directory inode: inode size - * the new inode: inode size - * the inode btree entry: block size - * the superblock for the nlink flag: sector size - * the directory btree: (max depth + v2) * dir block size - * the directory inode\'s bmap btree: (max depth + v2) * block size - * Or in the first xact we allocate some inodes giving: - * the agi and agf of the ag getting the new inodes: 2 * sectorsize - * the superblock for the nlink flag: sector size - * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_CREATE_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B(mp, 1) + \ - XFS_DIROP_LOG_RES(mp) + \ - (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \ - (3 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \ - XFS_FSB_TO_B((mp), XFS_IN_MAXLEVELS(mp)) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))))) - -#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) - -/* - * Making a new directory is the same as creating a new file. - */ -#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp) - -#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) - -/* - * In freeing an inode we can modify: - * the inode being freed: inode size - * the super block free inode counter: sector size - * the agi hash list and counters: sector size - * the inode btree entry: block size - * the on disk inode before ours in the agi hash list: inode cluster size - * the inode btree: max depth * blocksize - * the allocation btrees: 2 trees * (max depth - 1) * block size - */ -#define XFS_CALC_IFREE_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), 1) + \ - MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ - (128 * 5) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - - -#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) - -/* - * When only changing the inode we log the inode and possibly the superblock - * We also add a bit of slop for the transaction stuff. - */ -#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + 512) - -#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) - -/* - * Growing the data section of the filesystem. - * superblock - * agi and agf - * allocation btrees - */ -#define XFS_CALC_GROWDATA_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize * 3 + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - -#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) - -/* - * Growing the rt section of the filesystem. - * In the first set of transactions (ALLOC) we allocate space to the - * bitmap or summary files. - * superblock: sector size - * agf of the ag from which the extent is allocated: sector size - * bmap btree for bitmap/summary inode: max depth * blocksize - * bitmap/summary inode: inode size - * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize - */ -#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \ - (2 * (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \ - (mp)->m_sb.sb_inodesize + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * \ - (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - -#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) - -/* - * Growing the rt section of the filesystem. - * In the second set of transactions (ZERO) we zero the new metadata blocks. - * one bitmap/summary block: blocksize - */ -#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \ - ((mp)->m_sb.sb_blocksize + 128) - -#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) - -/* - * Growing the rt section of the filesystem. - * In the third set of transactions (FREE) we update metadata without - * allocating any new blocks. - * superblock: sector size - * bitmap inode: inode size - * summary inode: inode size - * one bitmap block: blocksize - * summary blocks: new summary size - */ -#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize + \ - 2 * (mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_blocksize + \ - (mp)->m_rsumsize + \ - (128 * 5)) - -#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) - -/* - * Logging the inode modification timestamp on a synchronous write. - * inode - */ -#define XFS_CALC_SWRITE_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + 128) - -#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Logging the inode timestamps on an fsync -- same as SWRITE - * as long as SWRITE logs the entire inode core - */ -#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Logging the inode mode bits when writing a setuid/setgid file - * inode - */ -#define XFS_CALC_WRITEID_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + 128) - -#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) - -/* - * Converting the inode from non-attributed to attributed. - * the inode being converted: inode size - * agf block and superblock (for block allocation) - * the new block (directory sized) - * bmap blocks for the new directory block - * allocation btrees - */ -#define XFS_CALC_ADDAFORK_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize * 2 + \ - (mp)->m_dirblksize + \ - XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \ - XFS_ALLOCFREE_LOG_RES(mp, 1) + \ - (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \ - XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) - -#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) - -/* - * Removing the attribute fork of a file - * the inode being truncated: inode size - * the inode\'s bmap btree: max depth * block size - * And the bmap_finish transaction can free the blocks and bmap blocks: - * the agf for each of the ags: 4 * sector size - * the agfl for each of the ags: 4 * sector size - * the super block to reflect the freed blocks: sector size - * worst case split in allocation btrees per extent assuming 4 extents: - * 4 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ - (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \ - ((4 * (mp)->m_sb.sb_sectsize) + \ - (4 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 4) + \ - (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))))) - -#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) - -/* - * Setting an attribute. - * the inode getting the attribute - * the superblock for allocations - * the agfs extents are allocated from - * the attribute btree * max depth - * the inode allocation btree - * Since attribute transaction space is dependent on the size of the attribute, - * the calculation is done partially at mount time and partially at runtime. - */ -#define XFS_CALC_ATTRSET_LOG_RES(mp) \ - ((mp)->m_sb.sb_inodesize + \ - (mp)->m_sb.sb_sectsize + \ - XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ - (128 * (2 + XFS_DA_NODE_MAXDEPTH))) - -#define XFS_ATTRSET_LOG_RES(mp, ext) \ - ((mp)->m_reservations.tr_attrset + \ - (ext * (mp)->m_sb.sb_sectsize) + \ - (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ - (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) - -/* - * Removing an attribute. - * the inode: inode size - * the attribute btree could join: max depth * block size - * the inode bmap btree could join or split: max depth * block size - * And the bmap_finish transaction can free the attr blocks freed giving: - * the agf for the ag in which the blocks live: 2 * sector size - * the agfl for the ag in which the blocks live: 2 * sector size - * the superblock for the free block count: sector size - * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size - */ -#define XFS_CALC_ATTRRM_LOG_RES(mp) \ - (MAX( \ - ((mp)->m_sb.sb_inodesize + \ - XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \ - XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \ - (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \ - ((2 * (mp)->m_sb.sb_sectsize) + \ - (2 * (mp)->m_sb.sb_sectsize) + \ - (mp)->m_sb.sb_sectsize + \ - XFS_ALLOCFREE_LOG_RES(mp, 2) + \ - (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))))) - -#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) - -/* - * Clearing a bad agino number in an agi hash bucket. - */ -#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \ - ((mp)->m_sb.sb_sectsize + 128) - -#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) - - -/* - * Various log count values. - */ -#define XFS_DEFAULT_LOG_COUNT 1 -#define XFS_DEFAULT_PERM_LOG_COUNT 2 -#define XFS_ITRUNCATE_LOG_COUNT 2 -#define XFS_INACTIVE_LOG_COUNT 2 -#define XFS_CREATE_LOG_COUNT 2 -#define XFS_MKDIR_LOG_COUNT 3 -#define XFS_SYMLINK_LOG_COUNT 3 -#define XFS_REMOVE_LOG_COUNT 2 -#define XFS_LINK_LOG_COUNT 2 -#define XFS_RENAME_LOG_COUNT 2 -#define XFS_WRITE_LOG_COUNT 2 -#define XFS_ADDAFORK_LOG_COUNT 2 -#define XFS_ATTRINVAL_LOG_COUNT 1 -#define XFS_ATTRSET_LOG_COUNT 3 -#define XFS_ATTRRM_LOG_COUNT 3 - -/* - * Here we centralize the specification of XFS meta-data buffer - * reference count values. This determine how hard the buffer - * cache tries to hold onto the buffer. - */ -#define XFS_AGF_REF 4 -#define XFS_AGI_REF 4 -#define XFS_AGFL_REF 3 -#define XFS_INO_BTREE_REF 3 -#define XFS_ALLOC_BTREE_REF 2 -#define XFS_BMAP_BTREE_REF 2 -#define XFS_DIR_BTREE_REF 2 -#define XFS_ATTR_BTREE_REF 1 -#define XFS_INO_REF 1 -#define XFS_DQUOT_REF 1 - -#ifdef __KERNEL__ /* * XFS transaction mechanism exported interfaces that are * actually macros. @@ -937,10 +138,10 @@ typedef struct xfs_trans { #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) -#ifdef DEBUG -#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (long)d) -#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (long)d) -#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (long)d) +#if defined(DEBUG) || defined(XFS_WARN) +#define xfs_trans_agblocks_delta(tp, d) ((tp)->t_ag_freeblks_delta += (int64_t)d) +#define xfs_trans_agflist_delta(tp, d) ((tp)->t_ag_flist_delta += (int64_t)d) +#define xfs_trans_agbtree_delta(tp, d) ((tp)->t_ag_btree_delta += (int64_t)d) #else #define xfs_trans_agblocks_delta(tp, d) #define xfs_trans_agflist_delta(tp, d) @@ -950,18 +151,54 @@ typedef struct xfs_trans { /* * XFS transaction mechanism exported interfaces. */ -void xfs_trans_init(struct xfs_mount *); xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); -xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint); +xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); xfs_trans_t *xfs_trans_dup(xfs_trans_t *); -int xfs_trans_reserve(xfs_trans_t *, uint, uint, uint, +int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); -void xfs_trans_mod_sb(xfs_trans_t *, uint, long); -struct xfs_buf *xfs_trans_get_buf(xfs_trans_t *, struct xfs_buftarg *, xfs_daddr_t, - int, uint); -int xfs_trans_read_buf(struct xfs_mount *, xfs_trans_t *, - struct xfs_buftarg *, xfs_daddr_t, int, uint, - struct xfs_buf **); +void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); + +struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + uint flags); + +static inline struct xfs_buf * +xfs_trans_get_buf( + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + uint flags) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_get_buf_map(tp, target, &map, 1, flags); +} + +int xfs_trans_read_buf_map(struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); + +static inline int +xfs_trans_read_buf( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + xfs_daddr_t blkno, + int numblks, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) +{ + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + return xfs_trans_read_buf_map(mp, tp, target, &map, 1, + flags, bpp, ops); +} + struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); void xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *); @@ -971,12 +208,11 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); -int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, - xfs_ino_t , uint, uint, struct xfs_inode **); -void xfs_trans_ijoin(xfs_trans_t *, struct xfs_inode *, uint); -void xfs_trans_ihold(xfs_trans_t *, struct xfs_inode *); +void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); +void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); struct xfs_efi_log_item *xfs_trans_get_efi(xfs_trans_t *, uint); @@ -992,22 +228,18 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int _xfs_trans_commit(xfs_trans_t *, - uint flags, - xfs_lsn_t *, - int *); -#define xfs_trans_commit(tp, flags, lsn) \ - _xfs_trans_commit(tp, flags, lsn, NULL) +int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *, int); -void xfs_trans_ail_init(struct xfs_mount *); -xfs_lsn_t xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t); -xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *); -void xfs_trans_unlocked_item(struct xfs_mount *, - xfs_log_item_t *); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); +int xfs_trans_ail_init(struct xfs_mount *); +void xfs_trans_ail_destroy(struct xfs_mount *); + +void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, + enum xfs_blft); +void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, + struct xfs_buf *src_bp); -#endif /* __KERNEL__ */ +extern kmem_zone_t *xfs_trans_zone; +extern kmem_zone_t *xfs_log_item_desc_zone; #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 558c87ff0c4..cb0f3a84cc6 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * Copyright (c) 2008 Dave Chinner * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -17,565 +18,778 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_trace.h" #include "xfs_error.h" - -STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *); -STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *); -STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *); +#include "xfs_log.h" #ifdef DEBUG -STATIC void xfs_ail_check(xfs_ail_entry_t *); -#else -#define xfs_ail_check(a) +/* + * Check that the list is sorted as it should be. + */ +STATIC void +xfs_ail_check( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + xfs_log_item_t *prev_lip; + + if (list_empty(&ailp->xa_ail)) + return; + + /* + * Check the next and previous entries are valid. + */ + ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); + prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + + prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); + if (&prev_lip->li_ail != &ailp->xa_ail) + ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + + +} +#else /* !DEBUG */ +#define xfs_ail_check(a,l) #endif /* DEBUG */ +/* + * Return a pointer to the last item in the AIL. If the AIL is empty, then + * return NULL. + */ +static xfs_log_item_t * +xfs_ail_max( + struct xfs_ail *ailp) +{ + if (list_empty(&ailp->xa_ail)) + return NULL; + + return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail); +} /* - * This is called by the log manager code to determine the LSN - * of the tail of the log. This is exactly the LSN of the first - * item in the AIL. If the AIL is empty, then this function - * returns 0. + * Return a pointer to the item which follows the given item in the AIL. If + * the given item is the last item in the list, then return NULL. + */ +static xfs_log_item_t * +xfs_ail_next( + struct xfs_ail *ailp, + xfs_log_item_t *lip) +{ + if (lip->li_ail.next == &ailp->xa_ail) + return NULL; + + return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail); +} + +/* + * This is called by the log manager code to determine the LSN of the tail of + * the log. This is exactly the LSN of the first item in the AIL. If the AIL + * is empty, then this function returns 0. * - * We need the AIL lock in order to get a coherent read of the - * lsn of the last item in the AIL. + * We need the AIL lock in order to get a coherent read of the lsn of the last + * item in the AIL. */ xfs_lsn_t -xfs_trans_tail_ail( - xfs_mount_t *mp) +xfs_ail_min_lsn( + struct xfs_ail *ailp) { - xfs_lsn_t lsn; + xfs_lsn_t lsn = 0; xfs_log_item_t *lip; - SPLDECL(s); - AIL_LOCK(mp,s); - lip = xfs_ail_min(&(mp->m_ail)); - if (lip == NULL) { - lsn = (xfs_lsn_t)0; - } else { + spin_lock(&ailp->xa_lock); + lip = xfs_ail_min(ailp); + if (lip) lsn = lip->li_lsn; - } - AIL_UNLOCK(mp, s); + spin_unlock(&ailp->xa_lock); return lsn; } /* - * xfs_trans_push_ail - * - * This routine is called to move the tail of the AIL - * forward. It does this by trying to flush items in the AIL - * whose lsns are below the given threshold_lsn. - * - * The routine returns the lsn of the tail of the log. + * Return the maximum lsn held in the AIL, or zero if the AIL is empty. */ -xfs_lsn_t -xfs_trans_push_ail( - xfs_mount_t *mp, - xfs_lsn_t threshold_lsn) +static xfs_lsn_t +xfs_ail_max_lsn( + struct xfs_ail *ailp) { - xfs_lsn_t lsn; - xfs_log_item_t *lip; - int gen; - int restarts; - int lock_result; - int flush_log; - SPLDECL(s); + xfs_lsn_t lsn = 0; + xfs_log_item_t *lip; -#define XFS_TRANS_PUSH_AIL_RESTARTS 10 + spin_lock(&ailp->xa_lock); + lip = xfs_ail_max(ailp); + if (lip) + lsn = lip->li_lsn; + spin_unlock(&ailp->xa_lock); - AIL_LOCK(mp,s); - lip = xfs_trans_first_ail(mp, &gen); - if (lip == NULL || XFS_FORCED_SHUTDOWN(mp)) { - /* - * Just return if the AIL is empty. - */ - AIL_UNLOCK(mp, s); - return (xfs_lsn_t)0; - } + return lsn; +} - XFS_STATS_INC(xs_push_ail); +/* + * The cursor keeps track of where our current traversal is up to by tracking + * the next item in the list for us. However, for this to be safe, removing an + * object from the AIL needs to invalidate any cursor that points to it. hence + * the traversal cursor needs to be linked to the struct xfs_ail so that + * deletion can search all the active cursors for invalidation. + */ +STATIC void +xfs_trans_ail_cursor_init( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + list_add_tail(&cur->list, &ailp->xa_cursors); +} - /* - * While the item we are looking at is below the given threshold - * try to flush it out. Make sure to limit the number of times - * we allow xfs_trans_next_ail() to restart scanning from the - * beginning of the list. We'd like not to stop until we've at least - * tried to push on everything in the AIL with an LSN less than - * the given threshold. However, we may give up before that if - * we realize that we've been holding the AIL_LOCK for 'too long', - * blocking interrupts. Currently, too long is < 500us roughly. - */ - flush_log = 0; - restarts = 0; - while (((restarts < XFS_TRANS_PUSH_AIL_RESTARTS) && - (XFS_LSN_CMP(lip->li_lsn, threshold_lsn) < 0))) { - /* - * If we can lock the item without sleeping, unlock - * the AIL lock and flush the item. Then re-grab the - * AIL lock so we can look for the next item on the - * AIL. Since we unlock the AIL while we flush the - * item, the next routine may start over again at the - * the beginning of the list if anything has changed. - * That is what the generation count is for. - * - * If we can't lock the item, either its holder will flush - * it or it is already being flushed or it is being relogged. - * In any of these case it is being taken care of and we - * can just skip to the next item in the list. - */ - lock_result = IOP_TRYLOCK(lip); - switch (lock_result) { - case XFS_ITEM_SUCCESS: - AIL_UNLOCK(mp, s); - XFS_STATS_INC(xs_push_ail_success); - IOP_PUSH(lip); - AIL_LOCK(mp,s); - break; +/* + * Get the next item in the traversal and advance the cursor. If the cursor + * was invalidated (indicated by a lip of 1), restart the traversal. + */ +struct xfs_log_item * +xfs_trans_ail_cursor_next( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur) +{ + struct xfs_log_item *lip = cur->item; - case XFS_ITEM_PUSHBUF: - AIL_UNLOCK(mp, s); - XFS_STATS_INC(xs_push_ail_pushbuf); -#ifdef XFSRACEDEBUG - delay_for_intr(); - delay(300); -#endif - ASSERT(lip->li_ops->iop_pushbuf); - ASSERT(lip); - IOP_PUSHBUF(lip); - AIL_LOCK(mp,s); - break; + if ((__psint_t)lip & 1) + lip = xfs_ail_min(ailp); + if (lip) + cur->item = xfs_ail_next(ailp, lip); + return lip; +} - case XFS_ITEM_PINNED: - XFS_STATS_INC(xs_push_ail_pinned); - flush_log = 1; - break; +/* + * When the traversal is complete, we need to remove the cursor from the list + * of traversing cursors. + */ +void +xfs_trans_ail_cursor_done( + struct xfs_ail_cursor *cur) +{ + cur->item = NULL; + list_del_init(&cur->list); +} - case XFS_ITEM_LOCKED: - XFS_STATS_INC(xs_push_ail_locked); - break; +/* + * Invalidate any cursor that is pointing to this item. This is called when an + * item is removed from the AIL. Any cursor pointing to this object is now + * invalid and the traversal needs to be terminated so it doesn't reference a + * freed object. We set the low bit of the cursor item pointer so we can + * distinguish between an invalidation and the end of the list when getting the + * next item from the cursor. + */ +STATIC void +xfs_trans_ail_cursor_clear( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_ail_cursor *cur; - case XFS_ITEM_FLUSHING: - XFS_STATS_INC(xs_push_ail_flushing); - break; + list_for_each_entry(cur, &ailp->xa_cursors, list) { + if (cur->item == lip) + cur->item = (struct xfs_log_item *) + ((__psint_t)cur->item | 1); + } +} - default: - ASSERT(0); - break; - } +/* + * Find the first item in the AIL with the given @lsn by searching in ascending + * LSN order and initialise the cursor to point to the next item for a + * ascending traversal. Pass a @lsn of zero to initialise the cursor to the + * first item in the AIL. Returns NULL if the list is empty. + */ +xfs_log_item_t * +xfs_trans_ail_cursor_first( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; - lip = xfs_trans_next_ail(mp, lip, &gen, &restarts); - if (lip == NULL) { - break; - } - if (XFS_FORCED_SHUTDOWN(mp)) { - /* - * Just return if we shut down during the last try. - */ - AIL_UNLOCK(mp, s); - return (xfs_lsn_t)0; - } + xfs_trans_ail_cursor_init(ailp, cur); + if (lsn == 0) { + lip = xfs_ail_min(ailp); + goto out; } - if (flush_log) { - /* - * If something we need to push out was pinned, then - * push out the log so it will become unpinned and - * move forward in the AIL. - */ - AIL_UNLOCK(mp, s); - XFS_STATS_INC(xs_push_ail_flush); - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - AIL_LOCK(mp, s); + list_for_each_entry(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0) + goto out; } + return NULL; - lip = xfs_ail_min(&(mp->m_ail)); - if (lip == NULL) { - lsn = (xfs_lsn_t)0; - } else { - lsn = lip->li_lsn; - } +out: + if (lip) + cur->item = xfs_ail_next(ailp, lip); + return lip; +} - AIL_UNLOCK(mp, s); - return lsn; -} /* xfs_trans_push_ail */ +static struct xfs_log_item * +__xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + xfs_lsn_t lsn) +{ + xfs_log_item_t *lip; + list_for_each_entry_reverse(lip, &ailp->xa_ail, li_ail) { + if (XFS_LSN_CMP(lip->li_lsn, lsn) <= 0) + return lip; + } + return NULL; +} /* - * This is to be called when an item is unlocked that may have - * been in the AIL. It will wake up the first member of the AIL - * wait list if this item's unlocking might allow it to progress. - * If the item is in the AIL, then we need to get the AIL lock - * while doing our checking so we don't race with someone going - * to sleep waiting for this event in xfs_trans_push_ail(). + * Find the last item in the AIL with the given @lsn by searching in descending + * LSN order and initialise the cursor to point to that item. If there is no + * item with the value of @lsn, then it sets the cursor to the last item with an + * LSN lower than @lsn. Returns NULL if the list is empty. */ -void -xfs_trans_unlocked_item( - xfs_mount_t *mp, - xfs_log_item_t *lip) +struct xfs_log_item * +xfs_trans_ail_cursor_last( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn) +{ + xfs_trans_ail_cursor_init(ailp, cur); + cur->item = __xfs_trans_ail_cursor_last(ailp, lsn); + return cur->item; +} + +/* + * Splice the log item list into the AIL at the given LSN. We splice to the + * tail of the given LSN to maintain insert order for push traversals. The + * cursor is optional, allowing repeated updates to the same LSN to avoid + * repeated traversals. This should not be called with an empty list. + */ +static void +xfs_ail_splice( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct list_head *list, + xfs_lsn_t lsn) { - xfs_log_item_t *min_lip; + struct xfs_log_item *lip; + + ASSERT(!list_empty(list)); /* - * If we're forcibly shutting down, we may have - * unlocked log items arbitrarily. The last thing - * we want to do is to move the tail of the log - * over some potentially valid data. + * Use the cursor to determine the insertion point if one is + * provided. If not, or if the one we got is not valid, + * find the place in the AIL where the items belong. */ - if (!(lip->li_flags & XFS_LI_IN_AIL) || - XFS_FORCED_SHUTDOWN(mp)) { - return; - } + lip = cur ? cur->item : NULL; + if (!lip || (__psint_t) lip & 1) + lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* - * This is the one case where we can call into xfs_ail_min() - * without holding the AIL lock because we only care about the - * case where we are at the tail of the AIL. If the object isn't - * at the tail, it doesn't matter what result we get back. This - * is slightly racy because since we were just unlocked, we could - * go to sleep between the call to xfs_ail_min and the call to - * xfs_log_move_tail, have someone else lock us, commit to us disk, - * move us out of the tail of the AIL, and then we wake up. However, - * the call to xfs_log_move_tail() doesn't do anything if there's - * not enough free space to wake people up so we're safe calling it. + * If a cursor is provided, we know we're processing the AIL + * in lsn order, and future items to be spliced in will + * follow the last one being inserted now. Update the + * cursor to point to that last item, now while we have a + * reliable pointer to it. */ - min_lip = xfs_ail_min(&mp->m_ail); - - if (min_lip == lip) - xfs_log_move_tail(mp, 1); -} /* xfs_trans_unlocked_item */ + if (cur) + cur->item = list_entry(list->prev, struct xfs_log_item, li_ail); + /* + * Finally perform the splice. Unless the AIL was empty, + * lip points to the item in the AIL _after_ which the new + * items should go. If lip is null the AIL was empty, so + * the new items go at the head of the AIL. + */ + if (lip) + list_splice(list, &lip->li_ail); + else + list_splice(list, &ailp->xa_ail); +} /* - * Update the position of the item in the AIL with the new - * lsn. If it is not yet in the AIL, add it. Otherwise, move - * it to its new position by removing it and re-adding it. - * - * Wakeup anyone with an lsn less than the item's lsn. If the item - * we move in the AIL is the minimum one, update the tail lsn in the - * log manager. - * - * Increment the AIL's generation count to indicate that the tree - * has changed. - * - * This function must be called with the AIL lock held. The lock - * is dropped before returning, so the caller must pass in the - * cookie returned by AIL_LOCK. + * Delete the given item from the AIL. Return a pointer to the item. */ -void -xfs_trans_update_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip, - xfs_lsn_t lsn, - unsigned long s) +static void +xfs_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip) { - xfs_ail_entry_t *ailp; - xfs_log_item_t *dlip=NULL; - xfs_log_item_t *mlip; /* ptr to minimum lip */ + xfs_ail_check(ailp, lip); + list_del(&lip->li_ail); + xfs_trans_ail_cursor_clear(ailp, lip); +} - ailp = &(mp->m_ail); - mlip = xfs_ail_min(ailp); +static long +xfsaild_push( + struct xfs_ail *ailp) +{ + xfs_mount_t *mp = ailp->xa_mount; + struct xfs_ail_cursor cur; + xfs_log_item_t *lip; + xfs_lsn_t lsn; + xfs_lsn_t target; + long tout; + int stuck = 0; + int flushing = 0; + int count = 0; - if (lip->li_flags & XFS_LI_IN_AIL) { - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); - } else { - lip->li_flags |= XFS_LI_IN_AIL; + /* + * If we encountered pinned items or did not finish writing out all + * buffers the last time we ran, force the log first and wait for it + * before pushing again. + */ + if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 && + (!list_empty_careful(&ailp->xa_buf_list) || + xfs_ail_min_lsn(ailp))) { + ailp->xa_log_flush = 0; + + XFS_STATS_INC(xs_push_ail_flush); + xfs_log_force(mp, XFS_LOG_SYNC); } - lip->li_lsn = lsn; + spin_lock(&ailp->xa_lock); - xfs_ail_insert(ailp, lip); - mp->m_ail_gen++; + /* barrier matches the xa_target update in xfs_ail_push() */ + smp_rmb(); + target = ailp->xa_target; + ailp->xa_target_prev = target; - if (mlip == dlip) { - mlip = xfs_ail_min(&(mp->m_ail)); - AIL_UNLOCK(mp, s); - xfs_log_move_tail(mp, mlip->li_lsn); - } else { - AIL_UNLOCK(mp, s); + lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); + if (!lip) { + /* + * If the AIL is empty or our push has reached the end we are + * done now. + */ + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + goto out_done; } + XFS_STATS_INC(xs_push_ail); -} /* xfs_trans_update_ail */ + lsn = lip->li_lsn; + while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { + int lock_result; -/* - * Delete the given item from the AIL. It must already be in - * the AIL. - * - * Wakeup anyone with an lsn less than item's lsn. If the item - * we delete in the AIL is the minimum one, update the tail lsn in the - * log manager. - * - * Clear the IN_AIL flag from the item, reset its lsn to 0, and - * bump the AIL's generation count to indicate that the tree - * has changed. - * - * This function must be called with the AIL lock held. The lock - * is dropped before returning, so the caller must pass in the - * cookie returned by AIL_LOCK. - */ -void -xfs_trans_delete_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip, - unsigned long s) -{ - xfs_ail_entry_t *ailp; - xfs_log_item_t *dlip; - xfs_log_item_t *mlip; + /* + * Note that iop_push may unlock and reacquire the AIL lock. We + * rely on the AIL cursor implementation to be able to deal with + * the dropped lock. + */ + lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + switch (lock_result) { + case XFS_ITEM_SUCCESS: + XFS_STATS_INC(xs_push_ail_success); + trace_xfs_ail_push(lip); - if (lip->li_flags & XFS_LI_IN_AIL) { - ailp = &(mp->m_ail); - mlip = xfs_ail_min(ailp); - dlip = xfs_ail_delete(ailp, lip); - ASSERT(dlip == lip); + ailp->xa_last_pushed_lsn = lsn; + break; + case XFS_ITEM_FLUSHING: + /* + * The item or its backing buffer is already beeing + * flushed. The typical reason for that is that an + * inode buffer is locked because we already pushed the + * updates to it as part of inode clustering. + * + * We do not want to to stop flushing just because lots + * of items are already beeing flushed, but we need to + * re-try the flushing relatively soon if most of the + * AIL is beeing flushed. + */ + XFS_STATS_INC(xs_push_ail_flushing); + trace_xfs_ail_flushing(lip); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; - mp->m_ail_gen++; + flushing++; + ailp->xa_last_pushed_lsn = lsn; + break; - if (mlip == dlip) { - mlip = xfs_ail_min(&(mp->m_ail)); - AIL_UNLOCK(mp, s); - xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); - } else { - AIL_UNLOCK(mp, s); + case XFS_ITEM_PINNED: + XFS_STATS_INC(xs_push_ail_pinned); + trace_xfs_ail_pinned(lip); + + stuck++; + ailp->xa_log_flush++; + break; + case XFS_ITEM_LOCKED: + XFS_STATS_INC(xs_push_ail_locked); + trace_xfs_ail_locked(lip); + + stuck++; + break; + default: + ASSERT(0); + break; } - } - else { + + count++; + /* - * If the file system is not being shutdown, we are in - * serious trouble if we get to this stage. + * Are there too many items we can't do anything with? + * + * If we we are skipping too many items because we can't flush + * them or they are already being flushed, we back off and + * given them time to complete whatever operation is being + * done. i.e. remove pressure from the AIL while we can't make + * progress so traversals don't slow down further inserts and + * removals to/from the AIL. + * + * The value of 100 is an arbitrary magic number based on + * observation. */ - if (XFS_FORCED_SHUTDOWN(mp)) - AIL_UNLOCK(mp, s); - else { - xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, - "%s: attempting to delete a log item that is not in the AIL", - __FUNCTION__); - AIL_UNLOCK(mp, s); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + if (stuck > 100) + break; + + lip = xfs_trans_ail_cursor_next(ailp, &cur); + if (lip == NULL) + break; + lsn = lip->li_lsn; } -} + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->xa_lock); + if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list)) + ailp->xa_log_flush++; + if (!count || XFS_LSN_CMP(lsn, target) >= 0) { +out_done: + /* + * We reached the target or the AIL is empty, so wait a bit + * longer for I/O to complete and remove pushed items from the + * AIL before we start the next scan from the start of the AIL. + */ + tout = 50; + ailp->xa_last_pushed_lsn = 0; + } else if (((stuck + flushing) * 100) / count > 90) { + /* + * Either there is a lot of contention on the AIL or we are + * stuck due to operations in progress. "Stuck" in this case + * is defined as >90% of the items we tried to push were stuck. + * + * Backoff a bit more to allow some I/O to complete before + * restarting from the start of the AIL. This prevents us from + * spinning on the same items, and if they are pinned will all + * the restart to issue a log force to unpin the stuck items. + */ + tout = 20; + ailp->xa_last_pushed_lsn = 0; + } else { + /* + * Assume we have more work to do in a short while. + */ + tout = 10; + } -/* - * Return the item in the AIL with the smallest lsn. - * Return the current tree generation number for use - * in calls to xfs_trans_next_ail(). - */ -xfs_log_item_t * -xfs_trans_first_ail( - xfs_mount_t *mp, - int *gen) + return tout; +} + +static int +xfsaild( + void *data) { - xfs_log_item_t *lip; + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ - lip = xfs_ail_min(&(mp->m_ail)); - *gen = (int)mp->m_ail_gen; + current->flags |= PF_MEMALLOC; - return (lip); -} + while (!kthread_should_stop()) { + if (tout && tout <= 20) + __set_current_state(TASK_KILLABLE); + else + __set_current_state(TASK_INTERRUPTIBLE); -/* - * If the generation count of the tree has not changed since the - * caller last took something from the AIL, then return the elmt - * in the tree which follows the one given. If the count has changed, - * then return the minimum elmt of the AIL and bump the restarts counter - * if one is given. - */ -xfs_log_item_t * -xfs_trans_next_ail( - xfs_mount_t *mp, - xfs_log_item_t *lip, - int *gen, - int *restarts) -{ - xfs_log_item_t *nlip; + spin_lock(&ailp->xa_lock); - ASSERT(mp && lip && gen); - if (mp->m_ail_gen == *gen) { - nlip = xfs_ail_next(&(mp->m_ail), lip); - } else { - nlip = xfs_ail_min(&(mp->m_ail)); - *gen = (int)mp->m_ail_gen; - if (restarts != NULL) { - XFS_STATS_INC(xs_push_ail_restarts); - (*restarts)++; + /* + * Idle if the AIL is empty and we are not racing with a target + * update. We check the AIL after we set the task to a sleep + * state to guarantee that we either catch an xa_target update + * or that a wake_up resets the state to TASK_RUNNING. + * Otherwise, we run the risk of sleeping indefinitely. + * + * The barrier matches the xa_target update in xfs_ail_push(). + */ + smp_rmb(); + if (!xfs_ail_min(ailp) && + ailp->xa_target == ailp->xa_target_prev) { + spin_unlock(&ailp->xa_lock); + schedule(); + tout = 0; + continue; } - } + spin_unlock(&ailp->xa_lock); - return (nlip); -} + if (tout) + schedule_timeout(msecs_to_jiffies(tout)); + __set_current_state(TASK_RUNNING); -/* - * The active item list (AIL) is a doubly linked list of log - * items sorted by ascending lsn. The base of the list is - * a forw/back pointer pair embedded in the xfs mount structure. - * The base is initialized with both pointers pointing to the - * base. This case always needs to be distinguished, because - * the base has no lsn to look at. We almost always insert - * at the end of the list, so on inserts we search from the - * end of the list to find where the new item belongs. - */ + try_to_freeze(); -/* - * Initialize the doubly linked list to point only to itself. - */ -void -xfs_trans_ail_init( - xfs_mount_t *mp) -{ - mp->m_ail.ail_forw = (xfs_log_item_t*)&(mp->m_ail); - mp->m_ail.ail_back = (xfs_log_item_t*)&(mp->m_ail); + tout = xfsaild_push(ailp); + } + + return 0; } /* - * Insert the given log item into the AIL. - * We almost always insert at the end of the list, so on inserts - * we search from the end of the list to find where the - * new item belongs. + * This routine is called to move the tail of the AIL forward. It does this by + * trying to flush items in the AIL whose lsns are below the given + * threshold_lsn. + * + * The push is run asynchronously in a workqueue, which means the caller needs + * to handle waiting on the async flush for space to become available. + * We don't want to interrupt any push that is in progress, hence we only queue + * work if we set the pushing bit approriately. + * + * We do this unlocked - we only need to know whether there is anything in the + * AIL at the time we are called. We don't need to access the contents of + * any of the objects, so the lock is not needed. */ -STATIC void -xfs_ail_insert( - xfs_ail_entry_t *base, - xfs_log_item_t *lip) -/* ARGSUSED */ +void +xfs_ail_push( + struct xfs_ail *ailp, + xfs_lsn_t threshold_lsn) { - xfs_log_item_t *next_lip; + xfs_log_item_t *lip; + + lip = xfs_ail_min(ailp); + if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) || + XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0) + return; /* - * If the list is empty, just insert the item. + * Ensure that the new target is noticed in push code before it clears + * the XFS_AIL_PUSHING_BIT. */ - if (base->ail_back == (xfs_log_item_t*)base) { - base->ail_forw = lip; - base->ail_back = lip; - lip->li_ail.ail_forw = (xfs_log_item_t*)base; - lip->li_ail.ail_back = (xfs_log_item_t*)base; - return; - } + smp_wmb(); + xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); + smp_wmb(); - next_lip = base->ail_back; - while ((next_lip != (xfs_log_item_t*)base) && - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) { - next_lip = next_lip->li_ail.ail_back; - } - ASSERT((next_lip == (xfs_log_item_t*)base) || - (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); - lip->li_ail.ail_forw = next_lip->li_ail.ail_forw; - lip->li_ail.ail_back = next_lip; - next_lip->li_ail.ail_forw = lip; - lip->li_ail.ail_forw->li_ail.ail_back = lip; - - xfs_ail_check(base); - return; + wake_up_process(ailp->xa_task); } /* - * Delete the given item from the AIL. Return a pointer to the item. + * Push out all items in the AIL immediately */ -/*ARGSUSED*/ -STATIC xfs_log_item_t * -xfs_ail_delete( - xfs_ail_entry_t *base, - xfs_log_item_t *lip) -/* ARGSUSED */ +void +xfs_ail_push_all( + struct xfs_ail *ailp) { - lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back; - lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw; - lip->li_ail.ail_forw = NULL; - lip->li_ail.ail_back = NULL; + xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp); - xfs_ail_check(base); - return lip; + if (threshold_lsn) + xfs_ail_push(ailp, threshold_lsn); } /* - * Return a pointer to the first item in the AIL. - * If the AIL is empty, then return NULL. + * Push out all items in the AIL immediately and wait until the AIL is empty. */ -STATIC xfs_log_item_t * -xfs_ail_min( - xfs_ail_entry_t *base) -/* ARGSUSED */ +void +xfs_ail_push_all_sync( + struct xfs_ail *ailp) { - register xfs_log_item_t *forw = base->ail_forw; - if (forw == (xfs_log_item_t*)base) { - return NULL; + struct xfs_log_item *lip; + DEFINE_WAIT(wait); + + spin_lock(&ailp->xa_lock); + while ((lip = xfs_ail_max(ailp)) != NULL) { + prepare_to_wait(&ailp->xa_empty, &wait, TASK_UNINTERRUPTIBLE); + ailp->xa_target = lip->li_lsn; + wake_up_process(ailp->xa_task); + spin_unlock(&ailp->xa_lock); + schedule(); + spin_lock(&ailp->xa_lock); } - return forw; + spin_unlock(&ailp->xa_lock); + + finish_wait(&ailp->xa_empty, &wait); } /* - * Return a pointer to the item which follows - * the given item in the AIL. If the given item - * is the last item in the list, then return NULL. + * xfs_trans_ail_update - bulk AIL insertion operation. + * + * @xfs_trans_ail_update takes an array of log items that all need to be + * positioned at the same LSN in the AIL. If an item is not in the AIL, it will + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. If we move the first item in the AIL, update the log tail to + * match the new minimum LSN in the AIL. + * + * This function takes the AIL lock once to execute the update operations on + * all the items in the array, and as such should not be called with the AIL + * lock held. As a result, once we have the AIL lock, we need to check each log + * item LSN to confirm it needs to be moved forward in the AIL. + * + * To optimise the insert operation, we delete all the items from the AIL in + * the first pass, moving them into a temporary list, then splice the temporary + * list into the correct position in the AIL. This avoids needing to do an + * insert operation on every item. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. */ -STATIC xfs_log_item_t * -xfs_ail_next( - xfs_ail_entry_t *base, - xfs_log_item_t *lip) -/* ARGSUSED */ +void +xfs_trans_ail_update_bulk( + struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, + int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock) { - if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) { - return NULL; + xfs_log_item_t *mlip; + int mlip_changed = 0; + int i; + LIST_HEAD(tmp); + + ASSERT(nr_items > 0); /* Not required, but true. */ + mlip = xfs_ail_min(ailp); + + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (lip->li_flags & XFS_LI_IN_AIL) { + /* check if we really need to move the item */ + if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) + continue; + + trace_xfs_ail_move(lip, lip->li_lsn, lsn); + xfs_ail_delete(ailp, lip); + if (mlip == lip) + mlip_changed = 1; + } else { + lip->li_flags |= XFS_LI_IN_AIL; + trace_xfs_ail_insert(lip, 0, lsn); + } + lip->li_lsn = lsn; + list_add(&lip->li_ail, &tmp); } - return lip->li_ail.ail_forw; + if (!list_empty(&tmp)) + xfs_ail_splice(ailp, cur, &tmp, lsn); + + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + spin_unlock(&ailp->xa_lock); + + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); + } } -#ifdef DEBUG /* - * Check that the list is sorted as it should be. + * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL + * + * @xfs_trans_ail_delete_bulk takes an array of log items that all need to + * removed from the AIL. The caller is already holding the AIL lock, and done + * all the checks necessary to ensure the items passed in via @log_items are + * ready for deletion. This includes checking that the items are in the AIL. + * + * For each log item to be removed, unlink it from the AIL, clear the IN_AIL + * flag from the item and reset the item's lsn to 0. If we remove the first + * item in the AIL, update the log tail to match the new minimum LSN in the + * AIL. + * + * This function will not drop the AIL lock until all items are removed from + * the AIL to minimise the amount of lock traffic on the AIL. This does not + * greatly increase the AIL hold time, but does significantly reduce the amount + * of traffic on the lock, especially during IO completion. + * + * This function must be called with the AIL lock held. The lock is dropped + * before returning. */ -STATIC void -xfs_ail_check( - xfs_ail_entry_t *base) +void +xfs_trans_ail_delete_bulk( + struct xfs_ail *ailp, + struct xfs_log_item **log_items, + int nr_items, + int shutdown_type) __releases(ailp->xa_lock) { - xfs_log_item_t *lip; - xfs_log_item_t *prev_lip; + xfs_log_item_t *mlip; + int mlip_changed = 0; + int i; - lip = base->ail_forw; - if (lip == (xfs_log_item_t*)base) { - /* - * Make sure the pointers are correct when the list - * is empty. - */ - ASSERT(base->ail_back == (xfs_log_item_t*)base); - return; - } + mlip = xfs_ail_min(ailp); - /* - * Walk the list checking forward and backward pointers, - * lsn ordering, and that every entry has the XFS_LI_IN_AIL - * flag set. - */ - prev_lip = (xfs_log_item_t*)base; - while (lip != (xfs_log_item_t*)base) { - if (prev_lip != (xfs_log_item_t*)base) { - ASSERT(prev_lip->li_ail.ail_forw == lip); - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); + for (i = 0; i < nr_items; i++) { + struct xfs_log_item *lip = log_items[i]; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + struct xfs_mount *mp = ailp->xa_mount; + + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); + } + return; } - ASSERT(lip->li_ail.ail_back == prev_lip); - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = lip; - lip = lip->li_ail.ail_forw; + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + if (mlip == lip) + mlip_changed = 1; + } + + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + spin_unlock(&ailp->xa_lock); + + xfs_log_space_wake(ailp->xa_mount); + } else { + spin_unlock(&ailp->xa_lock); } - ASSERT(lip == (xfs_log_item_t*)base); - ASSERT(base->ail_back == prev_lip); } -#endif /* DEBUG */ + +int +xfs_trans_ail_init( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp; + + ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); + if (!ailp) + return ENOMEM; + + ailp->xa_mount = mp; + INIT_LIST_HEAD(&ailp->xa_ail); + INIT_LIST_HEAD(&ailp->xa_cursors); + spin_lock_init(&ailp->xa_lock); + INIT_LIST_HEAD(&ailp->xa_buf_list); + init_waitqueue_head(&ailp->xa_empty); + + ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", + ailp->xa_mount->m_fsname); + if (IS_ERR(ailp->xa_task)) + goto out_free_ailp; + + mp->m_ail = ailp; + return 0; + +out_free_ailp: + kmem_free(ailp); + return ENOMEM; +} + +void +xfs_trans_ail_destroy( + xfs_mount_t *mp) +{ + struct xfs_ail *ailp = mp->m_ail; + + kthread_stop(ailp->xa_task); + kmem_free(ailp); +} diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 60b6b898022..b8eef0549f3 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -17,34 +17,111 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" +#include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_error.h" -#include "xfs_rw.h" +#include "xfs_trace.h" + +/* + * Check to see if a buffer matching the given parameters is already + * a part of the given transaction. + */ +STATIC struct xfs_buf * +xfs_trans_buf_item_match( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps) +{ + struct xfs_log_item_desc *lidp; + struct xfs_buf_log_item *blip; + int len = 0; + int i; + + for (i = 0; i < nmaps; i++) + len += map[i].bm_len; + + list_for_each_entry(lidp, &tp->t_items, lid_trans) { + blip = (struct xfs_buf_log_item *)lidp->lid_item; + if (blip->bli_item.li_type == XFS_LI_BUF && + blip->bli_buf->b_target == target && + XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + blip->bli_buf->b_length == len) { + ASSERT(blip->bli_buf->b_map_count == nmaps); + return blip->bli_buf; + } + } + + return NULL; +} + +/* + * Add the locked buffer to the transaction. + * + * The buffer must be locked, and it cannot be associated with any + * transaction. + * + * If the buffer does not yet have a buf log item associated with it, + * then allocate one for it. Then add the buf item to the transaction. + */ +STATIC void +_xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp, + int reset_recur) +{ + struct xfs_buf_log_item *bip; + + ASSERT(bp->b_transp == NULL); + + /* + * The xfs_buf_log_item pointer is stored in b_fsprivate. If + * it doesn't have one yet, then allocate one and initialize it. + * The checks to see if one is there are in xfs_buf_item_init(). + */ + xfs_buf_item_init(bp, tp->t_mountp); + bip = bp->b_fspriv; + ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); + ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); + if (reset_recur) + bip->bli_recur = 0; + + /* + * Take a reference for this transaction on the buf item. + */ + atomic_inc(&bip->bli_refcount); + /* + * Get a log_item_desc to point at the new item. + */ + xfs_trans_add_item(tp, &bip->bli_item); -STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); -STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, - xfs_daddr_t, int); + /* + * Initialize b_fsprivate2 so we can find it with incore_match() + * in xfs_trans_get_buf() and friends above. + */ + bp->b_transp = tp; +} + +void +xfs_trans_bjoin( + struct xfs_trans *tp, + struct xfs_buf *bp) +{ + _xfs_trans_bjoin(tp, bp, 0); + trace_xfs_trans_bjoin(bp->b_fspriv); +} /* * Get and lock the buffer for the caller if it is not already @@ -52,38 +129,22 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, * within the transaction, just increment its lock recursion count * and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use get_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ -xfs_buf_t * -xfs_trans_get_buf(xfs_trans_t *tp, - xfs_buftarg_t *target_dev, - xfs_daddr_t blkno, - int len, - uint flags) +struct xfs_buf * +xfs_trans_get_buf_map( + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; - if (flags == 0) - flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; - - /* - * Default to a normal get_buf() call if the tp is NULL. - */ - if (tp == NULL) { - bp = xfs_buf_get_flags(target_dev, blkno, len, - flags | BUF_BUSY); - return(bp); - } + if (!tp) + return xfs_buf_get_map(target, map, nmaps, flags); /* * If we find the buffer in the cache with this transaction @@ -91,86 +152,32 @@ xfs_trans_get_buf(xfs_trans_t *tp, * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target_dev, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target_dev, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); + ASSERT(xfs_buf_islocked(bp)); if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { - xfs_buftrace("TRANS GET RECUR SHUT", bp); - XFS_BUF_SUPER_STALE(bp); + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); } - /* - * If the buffer is stale then it was binval'ed - * since last read. This doesn't matter since the - * caller isn't allowed to use the data anyway. - */ - else if (XFS_BUF_ISSTALE(bp)) { - xfs_buftrace("TRANS GET RECUR STALE", bp); - ASSERT(!XFS_BUF_ISDELAYWRITE(bp)); - } - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; - xfs_buftrace("TRANS GET RECUR", bp); - xfs_buf_item_trace("GET RECUR", bip); + trace_xfs_trans_get_buf_recur(bip); return (bp); } - /* - * We always specify the BUF_BUSY flag within a transaction so - * that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_get_flags(target_dev, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_get_map(target, map, nmaps, flags); if (bp == NULL) { return NULL; } - ASSERT(!XFS_BUF_GETERROR(bp)); - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; + ASSERT(!bp->b_error); - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - xfs_buftrace("TRANS GET", bp); - xfs_buf_item_trace("GET", bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_get_buf(bp->b_fspriv); return (bp); } @@ -205,54 +212,21 @@ xfs_trans_getsb(xfs_trans_t *tp, * recursion count and return the buffer to the caller. */ bp = mp->m_sb_bp; - if (XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp) { - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + if (bp->b_transp == tp) { + bip = bp->b_fspriv; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; - xfs_buf_item_trace("GETSB RECUR", bip); + trace_xfs_trans_getsb_recur(bip); return (bp); } bp = xfs_getsb(mp, flags); - if (bp == NULL) { + if (bp == NULL) return NULL; - } - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, mp); - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - xfs_buf_item_trace("GETSB", bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_getsb(bp->b_fspriv); return (bp); } @@ -270,55 +244,49 @@ int xfs_error_mod = 33; * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * - * Use the fast path function xfs_trans_buf_item_match() or the buffer - * cache routine incore_match() to find the buffer - * if it is already owned by this transaction. - * - * If we don't already own the buffer, use read_buf() to get it. - * If it doesn't yet have an associated xfs_buf_log_item structure, - * then allocate one and add the item to this transaction. - * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ int -xfs_trans_read_buf( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len, - uint flags, - xfs_buf_t **bpp) +xfs_trans_read_buf_map( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buftarg *target, + struct xfs_buf_map *map, + int nmaps, + xfs_buf_flags_t flags, + struct xfs_buf **bpp, + const struct xfs_buf_ops *ops) { xfs_buf_t *bp; xfs_buf_log_item_t *bip; int error; - if (flags == 0) - flags = XFS_BUF_LOCK | XFS_BUF_MAPPED; - - /* - * Default to a normal get_buf() call if the tp is NULL. - */ - if (tp == NULL) { - bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + *bpp = NULL; + if (!tp) { + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (!bp) - return XFS_ERROR(ENOMEM); - - if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); - error = XFS_BUF_GETERROR(bp); + return (flags & XBF_TRYLOCK) ? + EAGAIN : XFS_ERROR(ENOMEM); + + if (bp->b_error) { + error = bp->b_error; + xfs_buf_ioerror_alert(bp, __func__); + XFS_BUF_UNDONE(bp); + xfs_buf_stale(bp); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG - if (xfs_do_error && (bp != NULL)) { + if (xfs_do_error) { if (xfs_error_target == target) { if (((xfs_req_num++) % xfs_error_mod) == 0) { xfs_buf_relse(bp); - cmn_err(CE_DEBUG, "Returning error!\n"); + xfs_debug(mp, "Returning error!"); return XFS_ERROR(EIO); } } @@ -338,36 +306,45 @@ xfs_trans_read_buf( * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ - if (tp->t_items.lic_next == NULL) { - bp = xfs_trans_buf_item_match(tp, target, blkno, len); - } else { - bp = xfs_trans_buf_item_match_all(tp, target, blkno, len); - } + bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { - ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT((XFS_BUF_ISERROR(bp)) == 0); + ASSERT(xfs_buf_islocked(bp)); + ASSERT(bp->b_transp == tp); + ASSERT(bp->b_fspriv != NULL); + ASSERT(!bp->b_error); if (!(XFS_BUF_ISDONE(bp))) { - xfs_buftrace("READ_BUF_INCORE !DONE", bp); + trace_xfs_trans_read_buf_io(bp, _RET_IP_); ASSERT(!XFS_BUF_ISASYNC(bp)); + ASSERT(bp->b_iodone == NULL); XFS_BUF_READ(bp); - xfsbdstrat(tp->t_mountp, bp); - xfs_iowait(bp); - if (XFS_BUF_GETERROR(bp) != 0) { - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); - error = XFS_BUF_GETERROR(bp); + bp->b_ops = ops; + + /* + * XXX(hch): clean up the error handling here to be less + * of a mess.. + */ + if (XFS_FORCED_SHUTDOWN(mp)) { + trace_xfs_bdstrat_shut(bp, _RET_IP_); + xfs_bioerror_relse(bp); + } else { + xfs_buf_iorequest(bp); + } + + error = xfs_buf_iowait(bp); + if (error) { + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* - * We can gracefully recover from most - * read errors. Ones we can't are those - * that happen after the transaction's - * already dirty. + * We can gracefully recover from most read + * errors. Ones we can't are those that happen + * after the transaction's already dirty. */ if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } } @@ -376,44 +353,39 @@ xfs_trans_read_buf( * brelse it either. Just get out. */ if (XFS_FORCED_SHUTDOWN(mp)) { - xfs_buftrace("READ_BUF_INCORE XFSSHUTDN", bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); *bpp = NULL; return XFS_ERROR(EIO); } - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); + bip = bp->b_fspriv; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); - xfs_buf_item_trace("READ RECUR", bip); + trace_xfs_trans_read_buf_recur(bip); *bpp = bp; return 0; } - /* - * We always specify the BUF_BUSY flag within a transaction so - * that get_buf does not try to push out a delayed write buffer - * which might cause another transaction to take place (if the - * buffer was delayed alloc). Such recursive transactions can - * easily deadlock with our current transaction as well as cause - * us to run out of stack space. - */ - bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); + bp = xfs_buf_read_map(target, map, nmaps, flags, ops); if (bp == NULL) { *bpp = NULL; - return 0; + return (flags & XBF_TRYLOCK) ? + 0 : XFS_ERROR(ENOMEM); } - if (XFS_BUF_GETERROR(bp) != 0) { - XFS_BUF_SUPER_STALE(bp); - xfs_buftrace("READ ERROR", bp); - error = XFS_BUF_GETERROR(bp); - - xfs_ioerror_alert("xfs_trans_read_buf", mp, - bp, blkno); + if (bp->b_error) { + error = bp->b_error; + xfs_buf_stale(bp); + XFS_BUF_DONE(bp); + xfs_buf_ioerror_alert(bp, __func__); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); + + /* bad CRC means corrupted metadata */ + if (error == EFSBADCRC) + error = EFSCORRUPTED; return error; } #ifdef DEBUG @@ -423,7 +395,7 @@ xfs_trans_read_buf( xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); xfs_buf_relse(bp); - cmn_err(CE_DEBUG, "Returning trans error!\n"); + xfs_debug(mp, "Returning trans error!"); return XFS_ERROR(EIO); } } @@ -432,65 +404,19 @@ xfs_trans_read_buf( if (XFS_FORCED_SHUTDOWN(mp)) goto shutdown_abort; - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - - /* - * Set the recursion count for the buffer within this transaction - * to 0. - */ - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - bip->bli_recur = 0; - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip); + _xfs_trans_bjoin(tp, bp, 1); + trace_xfs_trans_read_buf(bp->b_fspriv); - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - xfs_buftrace("TRANS READ", bp); - xfs_buf_item_trace("READ", bip); *bpp = bp; return 0; shutdown_abort: - /* - * the theory here is that buffer is good but we're - * bailing out because the filesystem is being forcibly - * shut down. So we should leave the b_flags alone since - * the buffer's not staled and just get out. - */ -#if defined(DEBUG) - if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) - cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); -#endif - ASSERT((XFS_BUF_BFLAGS(bp) & (XFS_B_STALE|XFS_B_DELWRI)) != - (XFS_B_STALE|XFS_B_DELWRI)); - - xfs_buftrace("READ_BUF XFSSHUTDN", bp); + trace_xfs_trans_read_buf_shut(bp, _RET_IP_); xfs_buf_relse(bp); *bpp = NULL; return XFS_ERROR(EIO); } - /* * Release the buffer bp which was previously acquired with one of the * xfs_trans_... buffer allocation routines if the buffer has not @@ -511,45 +437,24 @@ xfs_trans_brelse(xfs_trans_t *tp, xfs_buf_t *bp) { xfs_buf_log_item_t *bip; - xfs_log_item_t *lip; - xfs_log_item_desc_t *lidp; /* * Default to a normal brelse() call if the tp is NULL. */ if (tp == NULL) { - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); - /* - * If there's a buf log item attached to the buffer, - * then let the AIL know that the buffer is being - * unlocked. - */ - if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { - lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); - if (lip->li_type == XFS_LI_BUF) { - bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); - xfs_trans_unlocked_item( - bip->bli_item.li_mountp, - lip); - } - } + ASSERT(bp->b_transp == NULL); xfs_buf_relse(bp); return; } - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + bip = bp->b_fspriv; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); - /* - * Find the item descriptor pointing to this buffer's - * log item. It must be there. - */ - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); + trace_xfs_trans_brelse(bip); /* * If the release is just for a recursive lock, @@ -557,7 +462,6 @@ xfs_trans_brelse(xfs_trans_t *tp, */ if (bip->bli_recur > 0) { bip->bli_recur--; - xfs_buf_item_trace("RELSE RECUR", bip); return; } @@ -565,10 +469,8 @@ xfs_trans_brelse(xfs_trans_t *tp, * If the buffer is dirty within this transaction, we can't * release it until we commit. */ - if (lidp->lid_flags & XFS_LID_DIRTY) { - xfs_buf_item_trace("RELSE DIRTY", bip); + if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) return; - } /* * If the buffer has been invalidated, then we can't release @@ -576,18 +478,15 @@ xfs_trans_brelse(xfs_trans_t *tp, * as part of this transaction. This prevents us from pulling * the item from the AIL before we should. */ - if (bip->bli_flags & XFS_BLI_STALE) { - xfs_buf_item_trace("RELSE STALE", bip); + if (bip->bli_flags & XFS_BLI_STALE) return; - } ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - xfs_buf_item_trace("RELSE", bip); /* * Free up the log item descriptor tracking the released item. */ - xfs_trans_free_item(tp, lidp); + xfs_trans_del_item(&bip->bli_item); /* * Clear the hold flag in the buf log item if it is set. @@ -618,73 +517,15 @@ xfs_trans_brelse(xfs_trans_t *tp, ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); xfs_buf_item_relse(bp); - bip = NULL; - } - XFS_BUF_SET_FSPRIVATE2(bp, NULL); - - /* - * If we've still got a buf log item on the buffer, then - * tell the AIL that the buffer is being unlocked. - */ - if (bip != NULL) { - xfs_trans_unlocked_item(bip->bli_item.li_mountp, - (xfs_log_item_t*)bip); } + bp->b_transp = NULL; xfs_buf_relse(bp); - return; -} - -/* - * Add the locked buffer to the transaction. - * The buffer must be locked, and it cannot be associated with any - * transaction. - * - * If the buffer does not yet have a buf log item associated with it, - * then allocate one for it. Then add the buf item to the transaction. - */ -void -xfs_trans_bjoin(xfs_trans_t *tp, - xfs_buf_t *bp) -{ - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL); - - /* - * The xfs_buf_log_item pointer is stored in b_fsprivate. If - * it doesn't have one yet, then allocate one and initialize it. - * The checks to see if one is there are in xfs_buf_item_init(). - */ - xfs_buf_item_init(bp, tp->t_mountp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); - ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); - - /* - * Take a reference for this transaction on the buf item. - */ - atomic_inc(&bip->bli_refcount); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ - XFS_BUF_SET_FSPRIVATE2(bp, tp); - - xfs_buf_item_trace("BJOIN", bip); } /* * Mark the buffer as not needing to be unlocked when the buf item's - * IOP_UNLOCK() routine is called. The buffer must already be locked + * iop_unlock() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ @@ -692,18 +533,16 @@ void xfs_trans_bhold(xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + xfs_buf_log_item_t *bip = bp->b_fspriv; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); + bip->bli_flags |= XFS_BLI_HOLD; - xfs_buf_item_trace("BHOLD", bip); + trace_xfs_trans_bhold(bip); } /* @@ -714,19 +553,17 @@ void xfs_trans_bhold_release(xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + xfs_buf_log_item_t *bip = bp->b_fspriv; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); + bip->bli_flags &= ~XFS_BLI_HOLD; - xfs_buf_item_trace("BHOLD RELEASE", bip); + trace_xfs_trans_bhold_release(bip); } /* @@ -744,15 +581,13 @@ xfs_trans_log_buf(xfs_trans_t *tp, uint first, uint last) { - xfs_buf_log_item_t *bip; - xfs_log_item_desc_t *lidp; + xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT((first <= last) && (last < XFS_BUF_COUNT(bp))); - ASSERT((XFS_BUF_IODONE_FUNC(bp) == NULL) || - (XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks)); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(bp->b_iodone == NULL || + bp->b_iodone == xfs_buf_iodone_callbacks); /* * Mark the buffer as needing to be written out eventually, @@ -764,13 +599,13 @@ xfs_trans_log_buf(xfs_trans_t *tp, * inside the b_bdstrat callback so that this won't get written to * disk. */ - XFS_BUF_DELAYWRITE(bp); XFS_BUF_DONE(bp); - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); - bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*))xfs_buf_iodone; + bp->b_iodone = xfs_buf_iodone_callbacks; + bip->bli_item.li_cb = xfs_buf_iodone; + + trace_xfs_trans_log_buf(bip); /* * If we invalidated the buffer within this transaction, then @@ -779,143 +614,128 @@ xfs_trans_log_buf(xfs_trans_t *tp, * because we have a reference to the buffer this entire time. */ if (bip->bli_flags & XFS_BLI_STALE) { - xfs_buf_item_trace("BLOG UNSTALE", bip); bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(XFS_BUF_ISSTALE(bp)); XFS_BUF_UNSTALE(bp); - bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; - lidp->lid_flags &= ~XFS_LID_BUF_STALE; - bip->bli_flags |= XFS_BLI_LOGGED; - xfs_buf_item_log(bip, first, last); - xfs_buf_item_trace("BLOG", bip); + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + + /* + * If we have an ordered buffer we are not logging any dirty range but + * it still needs to be marked dirty and that it has been logged. + */ + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; + if (!(bip->bli_flags & XFS_BLI_ORDERED)) + xfs_buf_item_log(bip, first, last); } /* - * This called to invalidate a buffer that is being used within - * a transaction. Typically this is because the blocks in the - * buffer are being freed, so we need to prevent it from being - * written out when we're done. Allowing it to be written again - * might overwrite data in the free blocks if they are reallocated - * to a file. + * Invalidate a buffer that is being used within a transaction. * - * We prevent the buffer from being written out by clearing the - * B_DELWRI flag. We can't always - * get rid of the buf log item at this point, though, because - * the buffer may still be pinned by another transaction. If that - * is the case, then we'll wait until the buffer is committed to - * disk for the last time (we can tell by the ref count) and - * free it in xfs_buf_item_unpin(). Until it is cleaned up we - * will keep the buffer locked so that the buffer and buf log item - * are not reused. + * Typically this is because the blocks in the buffer are being freed, so we + * need to prevent it from being written out when we're done. Allowing it + * to be written again might overwrite data in the free blocks if they are + * reallocated to a file. + * + * We prevent the buffer from being written out by marking it stale. We can't + * get rid of the buf log item at this point because the buffer may still be + * pinned by another transaction. If that is the case, then we'll wait until + * the buffer is committed to disk for the last time (we can tell by the ref + * count) and free it in xfs_buf_item_unpin(). Until that happens we will + * keep the buffer locked so that the buffer and buf log item are not reused. + * + * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log + * the buf item. This will be used at recovery time to determine that copies + * of the buffer in the log before this should not be replayed. + * + * We mark the item descriptor and the transaction dirty so that we'll hold + * the buffer until after the commit. + * + * Since we're invalidating the buffer, we also clear the state about which + * parts of the buffer have been logged. We also clear the flag indicating + * that this is an inode buffer since the data in the buffer will no longer + * be valid. + * + * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + xfs_buf_log_item_t *bip = bp->b_fspriv; + int i; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); - ASSERT(lidp != NULL); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + trace_xfs_trans_binval(bip); + if (bip->bli_flags & XFS_BLI_STALE) { /* * If the buffer is already invalidated, then * just return. */ - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); - ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); - ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); - ASSERT(lidp->lid_flags & XFS_LID_DIRTY); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); + ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); + ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); + ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); - xfs_buftrace("XFS_BINVAL RECUR", bp); - xfs_buf_item_trace("BINVAL RECUR", bip); return; } - /* - * Clear the dirty bit in the buffer and set the STALE flag - * in the buf log item. The STALE flag will be used in - * xfs_buf_item_unpin() to determine if it should clean up - * when the last reference to the buf item is given up. - * We set the XFS_BLI_CANCEL flag in the buf log format structure - * and log the buf item. This will be used at recovery time - * to determine that copies of the buffer in the log before - * this should not be replayed. - * We mark the item descriptor and the transaction dirty so - * that we'll hold the buffer until after the commit. - * - * Since we're invalidating the buffer, we also clear the state - * about which parts of the buffer have been logged. We also - * clear the flag indicating that this is an inode buffer since - * the data in the buffer will no longer be valid. - * - * We set the stale bit in the buffer as well since we're getting - * rid of it. - */ - XFS_BUF_UNDELAYWRITE(bp); - XFS_BUF_STALE(bp); + xfs_buf_stale(bp); + bip->bli_flags |= XFS_BLI_STALE; - bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); - bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; - bip->bli_format.blf_flags |= XFS_BLI_CANCEL; - memset((char *)(bip->bli_format.blf_data_map), 0, - (bip->bli_format.blf_map_size * sizeof(uint))); - lidp->lid_flags |= XFS_LID_DIRTY|XFS_LID_BUF_STALE; + bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); + bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; + bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; + bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; + for (i = 0; i < bip->bli_format_count; i++) { + memset(bip->bli_formats[i].blf_data_map, 0, + (bip->bli_formats[i].blf_map_size * sizeof(uint))); + } + bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY; - xfs_buftrace("XFS_BINVAL", bp); - xfs_buf_item_trace("BINVAL", bip); } /* - * This call is used to indicate that the buffer contains on-disk - * inodes which must be handled specially during recovery. They - * require special handling because only the di_next_unlinked from - * the inodes in the buffer should be recovered. The rest of the - * data in the buffer is logged via the inodes themselves. + * This call is used to indicate that the buffer contains on-disk inodes which + * must be handled specially during recovery. They require special handling + * because only the di_next_unlinked from the inodes in the buffer should be + * recovered. The rest of the data in the buffer is logged via the inodes + * themselves. * - * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log - * format structure so that we'll know what to do at recovery time. + * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be + * transferred to the buffer's log format structure so that we'll know what to + * do at recovery time. */ -/* ARGSUSED */ void xfs_trans_inode_buf( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + xfs_buf_log_item_t *bip = bp->b_fspriv; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; + bip->bli_flags |= XFS_BLI_INODE_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets - * special processing during unpin - where any inodes + * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be @@ -926,22 +746,17 @@ xfs_trans_stale_inode_buf( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + xfs_buf_log_item_t *bip = bp->b_fspriv; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; - bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) - xfs_buf_iodone; + bip->bli_item.li_cb = xfs_buf_iodone; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } - - /* * Mark the buffer as being one which contains newly allocated * inodes. We need to make sure that even if this buffer is @@ -956,155 +771,113 @@ xfs_trans_inode_alloc_buf( xfs_trans_t *tp, xfs_buf_t *bp) { - xfs_buf_log_item_t *bip; + xfs_buf_log_item_t *bip = bp->b_fspriv; - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } - /* - * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of - * dquots. However, unlike in inode buffer recovery, dquot buffers get - * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). - * The only thing that makes dquot buffers different from regular - * buffers is that we must not replay dquot bufs when recovering - * if a _corresponding_ quotaoff has happened. We also have to distinguish - * between usr dquot bufs and grp dquot bufs, because usr and grp quotas - * can be turned off independently. + * Mark the buffer as ordered for this transaction. This means + * that the contents of the buffer are not recorded in the transaction + * but it is tracked in the AIL as though it was. This allows us + * to record logical changes in transactions rather than the physical + * changes we make to the buffer without changing writeback ordering + * constraints of metadata buffers. */ -/* ARGSUSED */ void -xfs_trans_dquot_buf( - xfs_trans_t *tp, - xfs_buf_t *bp, - uint type) +xfs_trans_ordered_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip; - - ASSERT(XFS_BUF_ISBUSY(bp)); - ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); - ASSERT(type == XFS_BLI_UDQUOT_BUF || - type == XFS_BLI_PDQUOT_BUF || - type == XFS_BLI_GDQUOT_BUF); + struct xfs_buf_log_item *bip = bp->b_fspriv; - bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); - bip->bli_format.blf_flags |= type; + bip->bli_flags |= XFS_BLI_ORDERED; + trace_xfs_buf_item_ordered(bip); } /* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Only check the first, embedded - * chunk, since we don't want to spend all day scanning large transactions. + * Set the type of the buffer for log recovery so that it can correctly identify + * and hence attach the correct buffer ops to the buffer after replay. */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match( - xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) +void +xfs_trans_buf_set_type( + struct xfs_trans *tp, + struct xfs_buf *bp, + enum xfs_blft type) { - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; + struct xfs_buf_log_item *bip = bp->b_fspriv; - bp = NULL; - len = BBTOB(len); - licp = &tp->t_items; - if (!XFS_LIC_ARE_ALL_FREE(licp)) { - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } + if (!tp) + return; - lidp = XFS_LIC_SLOT(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } + ASSERT(bp->b_transp == tp); + ASSERT(bip != NULL); + ASSERT(atomic_read(&bip->bli_refcount) > 0); - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - break; - } else { - bp = NULL; - } - } - } - return bp; + xfs_blft_to_flags(&bip->__bli_format, type); +} + +void +xfs_trans_buf_copy_type( + struct xfs_buf *dst_bp, + struct xfs_buf *src_bp) +{ + struct xfs_buf_log_item *sbip = src_bp->b_fspriv; + struct xfs_buf_log_item *dbip = dst_bp->b_fspriv; + enum xfs_blft type; + + type = xfs_blft_from_flags(&sbip->__bli_format); + xfs_blft_to_flags(&dbip->__bli_format, type); } /* - * Check to see if a buffer matching the given parameters is already - * a part of the given transaction. Check all the chunks, we - * want to be thorough. + * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of + * dquots. However, unlike in inode buffer recovery, dquot buffers get + * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). + * The only thing that makes dquot buffers different from regular + * buffers is that we must not replay dquot bufs when recovering + * if a _corresponding_ quotaoff has happened. We also have to distinguish + * between usr dquot bufs and grp dquot bufs, because usr and grp quotas + * can be turned off independently. */ -STATIC xfs_buf_t * -xfs_trans_buf_item_match_all( +/* ARGSUSED */ +void +xfs_trans_dquot_buf( xfs_trans_t *tp, - xfs_buftarg_t *target, - xfs_daddr_t blkno, - int len) + xfs_buf_t *bp, + uint type) { - xfs_log_item_chunk_t *licp; - xfs_log_item_desc_t *lidp; - xfs_buf_log_item_t *blip; - xfs_buf_t *bp; - int i; - - bp = NULL; - len = BBTOB(len); - for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) { - if (XFS_LIC_ARE_ALL_FREE(licp)) { - ASSERT(licp == &tp->t_items); - ASSERT(licp->lic_next == NULL); - return NULL; - } - for (i = 0; i < licp->lic_unused; i++) { - /* - * Skip unoccupied slots. - */ - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - - lidp = XFS_LIC_SLOT(licp, i); - blip = (xfs_buf_log_item_t *)lidp->lid_item; - if (blip->bli_item.li_type != XFS_LI_BUF) { - continue; - } - - bp = blip->bli_buf; - if ((XFS_BUF_TARGET(bp) == target) && - (XFS_BUF_ADDR(bp) == blkno) && - (XFS_BUF_COUNT(bp) == len)) { - /* - * We found it. Break out and - * return the pointer to the buffer. - */ - return bp; - } - } + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(type == XFS_BLF_UDQUOT_BUF || + type == XFS_BLF_PDQUOT_BUF || + type == XFS_BLF_GDQUOT_BUF); + + bip->__bli_format.blf_flags |= type; + + switch (type) { + case XFS_BLF_UDQUOT_BUF: + type = XFS_BLFT_UDQUOT_BUF; + break; + case XFS_BLF_PDQUOT_BUF: + type = XFS_BLFT_PDQUOT_BUF; + break; + case XFS_BLF_GDQUOT_BUF: + type = XFS_BLFT_GDQUOT_BUF; + break; + default: + type = XFS_BLFT_UNKNOWN_BUF; + break; } - return NULL; + + xfs_trans_buf_set_type(tp, bp, type); } diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0242e9666e8..41172861e85 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -17,37 +17,18 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_alloc.h" -#include "xfs_dmapi.h" -#include "xfs_quota.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_attr_sf.h" -#include "xfs_dir2_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_ialloc.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_bmap.h" -#include "xfs_rtalloc.h" #include "xfs_error.h" -#include "xfs_rw.h" -#include "xfs_acl.h" -#include "xfs_cap.h" -#include "xfs_mac.h" -#include "xfs_attr.h" -#include "xfs_buf_item.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" +#include "xfs_quota.h" #include "xfs_qm.h" STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *); @@ -62,20 +43,17 @@ xfs_trans_dqjoin( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_dq_logitem_t *lp; - - ASSERT(! XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp != tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_LOGITEM_INITD(dqp)); - lp = &dqp->q_logitem; + ASSERT(dqp->q_logitem.qli_dquot == dqp); /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(lp)); + xfs_trans_add_item(tp, &dqp->q_logitem.qli_item); /* - * Initialize i_transp so we can later determine if this dquot is + * Initialize d_transp so we can later determine if this dquot is * associated with this transaction. */ dqp->q_transp = tp; @@ -97,23 +75,18 @@ xfs_trans_log_dquot( xfs_trans_t *tp, xfs_dquot_t *dqp) { - xfs_log_item_desc_t *lidp; - - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); ASSERT(XFS_DQ_IS_LOCKED(dqp)); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(&dqp->q_logitem)); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; } /* * Carry forward whatever is left of the quota blk reservation to * the spanky new transaction */ -STATIC void +void xfs_trans_dup_dqinfo( xfs_trans_t *otp, xfs_trans_t *ntp) @@ -126,8 +99,6 @@ xfs_trans_dup_dqinfo( return; xfs_trans_alloc_dqinfo(ntp); - oqa = otp->t_dqinfo->dqa_usrdquots; - nqa = ntp->t_dqinfo->dqa_usrdquots; /* * Because the quota blk reservation is carried forward, @@ -136,7 +107,9 @@ xfs_trans_dup_dqinfo( if(otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; - for (j = 0; j < 2; j++) { + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + oqa = otp->t_dqinfo->dqs[j]; + nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { if (oqa[i].qt_dquot == NULL) break; @@ -161,29 +134,24 @@ xfs_trans_dup_dqinfo( oq->qt_ino_res = oq->qt_ino_res_used; } - oqa = otp->t_dqinfo->dqa_grpdquots; - nqa = ntp->t_dqinfo->dqa_grpdquots; } } /* * Wrap around mod_dquot to account for both user and group quotas. */ -STATIC void +void xfs_trans_mod_dquot_byino( xfs_trans_t *tp, xfs_inode_t *ip, uint field, long delta) { - xfs_mount_t *mp; + xfs_mount_t *mp = tp->t_mountp; - ASSERT(tp); - mp = tp->t_mountp; - - if (!XFS_IS_QUOTA_ON(mp) || - ip->i_ino == mp->m_sb.sb_uquotino || - ip->i_ino == mp->m_sb.sb_gquotino) + if (!XFS_IS_QUOTA_RUNNING(mp) || + !XFS_IS_QUOTA_ON(mp) || + xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; if (tp->t_dqinfo == NULL) @@ -191,28 +159,36 @@ xfs_trans_mod_dquot_byino( if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot) (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta); - if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot) + if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot) (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta); + if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot) + (void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta); } -STATIC xfs_dqtrx_t * +STATIC struct xfs_dqtrx * xfs_trans_get_dqtrx( - xfs_trans_t *tp, - xfs_dquot_t *dqp) + struct xfs_trans *tp, + struct xfs_dquot *dqp) { - int i; - xfs_dqtrx_t *qa; + int i; + struct xfs_dqtrx *qa; + + if (XFS_QM_ISUDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_USR]; + else if (XFS_QM_ISGDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_GRP]; + else if (XFS_QM_ISPDQ(dqp)) + qa = tp->t_dqinfo->dqs[XFS_QM_TRANS_PRJ]; + else + return NULL; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { - qa = XFS_QM_DQP_TO_DQACCT(tp, dqp); - if (qa[i].qt_dquot == NULL || - qa[i].qt_dquot == dqp) { - return (&qa[i]); - } + qa[i].qt_dquot == dqp) + return &qa[i]; } - return (NULL); + return NULL; } /* @@ -231,6 +207,7 @@ xfs_trans_mod_dquot( xfs_dqtrx_t *qtrx; ASSERT(tp); + ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); qtrx = NULL; if (tp->t_dqinfo == NULL) @@ -316,11 +293,10 @@ xfs_trans_mod_dquot( /* - * Given an array of dqtrx structures, lock all the dquots associated - * and join them to the transaction, provided they have been modified. - * We know that the highest number of dquots (of one type - usr OR grp), - * involved in a transaction is 2 and that both usr and grp combined - 3. - * So, we don't attempt to make this very generic. + * Given an array of dqtrx structures, lock all the dquots associated and join + * them to the transaction, provided they have been modified. We know that the + * highest number of dquots of one type - usr, grp and prj - involved in a + * transaction is 3 so we don't need to make this very generic. */ STATIC void xfs_trans_dqlockedjoin( @@ -348,27 +324,25 @@ xfs_trans_dqlockedjoin( * Unreserve just the reservations done by this transaction. * dquot is still left locked at exit. */ -STATIC void +void xfs_trans_apply_dquot_deltas( - xfs_trans_t *tp) + struct xfs_trans *tp) { int i, j; - xfs_dquot_t *dqp; - xfs_dqtrx_t *qtrx, *qa; - xfs_disk_dquot_t *d; + struct xfs_dquot *dqp; + struct xfs_dqtrx *qtrx, *qa; + struct xfs_disk_dquot *d; long totalbdelta; long totalrtbdelta; - if (! (tp->t_flags & XFS_TRANS_DQ_DIRTY)) + if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; ASSERT(tp->t_dqinfo); - qa = tp->t_dqinfo->dqa_usrdquots; - for (j = 0; j < 2; j++) { - if (qa[0].qt_dquot == NULL) { - qa = tp->t_dqinfo->dqa_grpdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; + if (qa[0].qt_dquot == NULL) continue; - } /* * Lock all of the dquots and join them to the transaction. @@ -385,7 +359,7 @@ xfs_trans_apply_dquot_deltas( break; ASSERT(XFS_DQ_IS_LOCKED(dqp)); - ASSERT(XFS_DQ_IS_ADDEDTO_TRX(tp, dqp)); + ASSERT(dqp->q_transp == tp); /* * adjust the actual number of blocks used @@ -409,34 +383,34 @@ xfs_trans_apply_dquot_deltas( qtrx->qt_delbcnt_delta; totalrtbdelta = qtrx->qt_rtbcount_delta + qtrx->qt_delrtb_delta; -#ifdef QUOTADEBUG +#ifdef DEBUG if (totalbdelta < 0) ASSERT(be64_to_cpu(d->d_bcount) >= - (xfs_qcnt_t) -totalbdelta); + -totalbdelta); if (totalrtbdelta < 0) ASSERT(be64_to_cpu(d->d_rtbcount) >= - (xfs_qcnt_t) -totalrtbdelta); + -totalrtbdelta); if (qtrx->qt_icount_delta < 0) ASSERT(be64_to_cpu(d->d_icount) >= - (xfs_qcnt_t) -qtrx->qt_icount_delta); + -qtrx->qt_icount_delta); #endif if (totalbdelta) - be64_add(&d->d_bcount, (xfs_qcnt_t)totalbdelta); + be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta); if (qtrx->qt_icount_delta) - be64_add(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); + be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta); if (totalrtbdelta) - be64_add(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); + be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta); /* * Get any default limits in use. * Start/reset the timer(s) if needed. */ if (d->d_id) { - xfs_qm_adjust_dqlimits(tp->t_mountp, d); + xfs_qm_adjust_dqlimits(tp->t_mountp, dqp); xfs_qm_adjust_dqtimers(tp->t_mountp, d); } @@ -519,10 +493,6 @@ xfs_trans_apply_dquot_deltas( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); } - /* - * Do the group quotas next - */ - qa = tp->t_dqinfo->dqa_grpdquots; } } @@ -533,21 +503,21 @@ xfs_trans_apply_dquot_deltas( * we simply throw those away, since that's the expected behavior * when a transaction is curtailed without a commit. */ -STATIC void +void xfs_trans_unreserve_and_mod_dquots( xfs_trans_t *tp) { int i, j; xfs_dquot_t *dqp; xfs_dqtrx_t *qtrx, *qa; - boolean_t locked; + bool locked; if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) return; - qa = tp->t_dqinfo->dqa_usrdquots; + for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { + qa = tp->t_dqinfo->dqs[j]; - for (j = 0; j < 2; j++) { for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { qtrx = &qa[i]; /* @@ -561,17 +531,17 @@ xfs_trans_unreserve_and_mod_dquots( * about the number of blocks used field, or deltas. * Also we don't bother to zero the fields. */ - locked = B_FALSE; + locked = false; if (qtrx->qt_blk_res) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; dqp->q_res_bcount -= (xfs_qcnt_t)qtrx->qt_blk_res; } if (qtrx->qt_ino_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_icount -= (xfs_qcnt_t)qtrx->qt_ino_res; @@ -580,7 +550,7 @@ xfs_trans_unreserve_and_mod_dquots( if (qtrx->qt_rtblk_res) { if (!locked) { xfs_dqlock(dqp); - locked = B_TRUE; + locked = true; } dqp->q_res_rtbcount -= (xfs_qcnt_t)qtrx->qt_rtblk_res; @@ -589,16 +559,23 @@ xfs_trans_unreserve_and_mod_dquots( xfs_dqunlock(dqp); } - qa = tp->t_dqinfo->dqa_grpdquots; } } -STATIC int -xfs_quota_error(uint flags) +STATIC void +xfs_quota_warn( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + int type) { - if (flags & XFS_QMOPT_ENOSPC) - return ENOSPC; - return EDQUOT; + /* no warnings for project quotas - we just return ENOSPC later */ + if (dqp->dq_flags & XFS_DQ_PROJ) + return; + quota_send_warning(make_kqid(&init_user_ns, + (dqp->dq_flags & XFS_DQ_USER) ? + USRQUOTA : GRPQUOTA, + be32_to_cpu(dqp->q_core.d_id)), + mp->m_super->s_dev, type); } /* @@ -616,20 +593,18 @@ xfs_trans_dqresv( long ninos, uint flags) { - int error; xfs_qcnt_t hardlimit; xfs_qcnt_t softlimit; time_t timer; xfs_qwarncnt_t warns; xfs_qwarncnt_t warnlimit; - xfs_qcnt_t count; + xfs_qcnt_t total_count; xfs_qcnt_t *resbcountp; xfs_quotainfo_t *q = mp->m_quotainfo; - if (! (flags & XFS_QMOPT_DQLOCK)) { - xfs_dqlock(dqp); - } - ASSERT(XFS_DQ_IS_LOCKED(dqp)); + + xfs_dqlock(dqp); + if (flags & XFS_TRANS_DQ_RES_BLKS) { hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit); if (!hardlimit) @@ -639,7 +614,7 @@ xfs_trans_dqresv( softlimit = q->qi_bsoftlimit; timer = be32_to_cpu(dqp->q_core.d_btimer); warns = be16_to_cpu(dqp->q_core.d_bwarns); - warnlimit = XFS_QI_BWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit; resbcountp = &dqp->q_res_bcount; } else { ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS); @@ -651,59 +626,61 @@ xfs_trans_dqresv( softlimit = q->qi_rtbsoftlimit; timer = be32_to_cpu(dqp->q_core.d_rtbtimer); warns = be16_to_cpu(dqp->q_core.d_rtbwarns); - warnlimit = XFS_QI_RTBWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit; resbcountp = &dqp->q_res_rtbcount; } - error = 0; if ((flags & XFS_QMOPT_FORCE_RES) == 0 && dqp->q_core.d_id && - XFS_IS_QUOTA_ENFORCED(dqp->q_mount)) { -#ifdef QUOTADEBUG - cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" - " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); -#endif + ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) || + (XFS_IS_GQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISGDQ(dqp)) || + (XFS_IS_PQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISPDQ(dqp)))) { if (nblks > 0) { /* * dquot is locked already. See if we'd go over the * hardlimit or exceed the timelimit if we allocate * nblks. */ - if (hardlimit > 0ULL && - (hardlimit <= nblks + *resbcountp)) { - error = xfs_quota_error(flags); + total_count = *resbcountp + nblks; + if (hardlimit && total_count > hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN); goto error_return; } - - if (softlimit > 0ULL && - (softlimit <= nblks + *resbcountp)) { + if (softlimit && total_count > softlimit) { if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = xfs_quota_error(flags); + xfs_quota_warn(mp, dqp, + QUOTA_NL_BSOFTLONGWARN); goto error_return; } + + xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN); } } if (ninos > 0) { - count = be64_to_cpu(dqp->q_core.d_icount); + total_count = be64_to_cpu(dqp->q_core.d_icount) + ninos; timer = be32_to_cpu(dqp->q_core.d_itimer); warns = be16_to_cpu(dqp->q_core.d_iwarns); - warnlimit = XFS_QI_IWARNLIMIT(dqp->q_mount); + warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit; hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); if (!hardlimit) hardlimit = q->qi_ihardlimit; softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); if (!softlimit) softlimit = q->qi_isoftlimit; - if (hardlimit > 0ULL && count >= hardlimit) { - error = xfs_quota_error(flags); + + if (hardlimit && total_count > hardlimit) { + xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN); goto error_return; - } else if (softlimit > 0ULL && count >= softlimit) { - if ((timer != 0 && get_seconds() > timer) || + } + if (softlimit && total_count > softlimit) { + if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = xfs_quota_error(flags); + xfs_quota_warn(mp, dqp, + QUOTA_NL_ISOFTLONGWARN); goto error_return; } + xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN); } } } @@ -739,22 +716,24 @@ xfs_trans_dqresv( ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount)); ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount)); + xfs_dqunlock(dqp); + return 0; + error_return: - if (! (flags & XFS_QMOPT_DQLOCK)) { - xfs_dqunlock(dqp); - } - return (error); + xfs_dqunlock(dqp); + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; } /* * Given dquot(s), make disk block and/or inode reservations against them. - * The fact that this does the reservation against both the usr and - * grp/prj quotas is important, because this follows a both-or-nothing + * The fact that this does the reservation against user, group and + * project quotas is important, because this follows a all-or-nothing * approach. * - * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. - * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks @@ -762,17 +741,18 @@ error_return: */ int xfs_trans_reserve_quota_bydquots( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_dquot_t *udqp, - xfs_dquot_t *gdqp, - long nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_mount *mp, + struct xfs_dquot *udqp, + struct xfs_dquot *gdqp, + struct xfs_dquot *pdqp, + long nblks, + long ninos, + uint flags) { - int resvd = 0, error; + int error; - if (!XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; if (tp && tp->t_dqinfo == NULL) @@ -785,28 +765,34 @@ xfs_trans_reserve_quota_bydquots( (flags & ~XFS_QMOPT_ENOSPC)); if (error) return error; - resvd = 1; } if (gdqp) { error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); - if (error) { - /* - * can't do it, so backout previous reservation - */ - if (resvd) { - flags |= XFS_QMOPT_FORCE_RES; - xfs_trans_dqresv(tp, mp, udqp, - -nblks, -ninos, flags); - } - return error; - } + if (error) + goto unwind_usr; + } + + if (pdqp) { + error = xfs_trans_dqresv(tp, mp, pdqp, nblks, ninos, flags); + if (error) + goto unwind_grp; } /* * Didn't change anything critical, so, no need to log */ return 0; + +unwind_grp: + flags |= XFS_QMOPT_FORCE_RES; + if (gdqp) + xfs_trans_dqresv(tp, mp, gdqp, -nblks, -ninos, flags); +unwind_usr: + flags |= XFS_QMOPT_FORCE_RES; + if (udqp) + xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); + return error; } @@ -815,27 +801,24 @@ xfs_trans_reserve_quota_bydquots( * This doesn't change the actual usage, just the reservation. * The inode sent in is locked. */ -STATIC int +int xfs_trans_reserve_quota_nblks( - xfs_trans_t *tp, - xfs_mount_t *mp, - xfs_inode_t *ip, - long nblks, - long ninos, - uint flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + long nblks, + long ninos, + uint flags) { - int error; + struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) return 0; if (XFS_IS_PQUOTA_ON(mp)) flags |= XFS_QMOPT_ENOSPC; - ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); - ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); + ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); - ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == XFS_TRANS_DQ_RES_RTBLKS || (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == @@ -844,11 +827,10 @@ xfs_trans_reserve_quota_nblks( /* * Reserve nblks against these dquots, with trans as the mediator. */ - error = xfs_trans_reserve_quota_bydquots(tp, mp, - ip->i_udquot, ip->i_gdquot, - nblks, ninos, - flags); - return error; + return xfs_trans_reserve_quota_bydquots(tp, mp, + ip->i_udquot, ip->i_gdquot, + ip->i_pdquot, + nblks, ninos, flags); } /* @@ -870,9 +852,8 @@ xfs_trans_get_qoff_item( /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)q); - - return (q); + xfs_trans_add_item(tp, &q->qql_item); + return q; } @@ -886,38 +867,23 @@ xfs_trans_log_quotaoff_item( xfs_trans_t *tp, xfs_qoff_logitem_t *qlp) { - xfs_log_item_desc_t *lidp; - - lidp = xfs_trans_find_item(tp, (xfs_log_item_t *)qlp); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; } STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - (tp)->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP); + tp->t_dqinfo = kmem_zone_zalloc(xfs_qm_dqtrxzone, KM_SLEEP); } -STATIC void +void xfs_trans_free_dqinfo( xfs_trans_t *tp) { if (!tp->t_dqinfo) return; - kmem_zone_free(xfs_Gqm->qm_dqtrxzone, (tp)->t_dqinfo); - (tp)->t_dqinfo = NULL; + kmem_zone_free(xfs_qm_dqtrxzone, tp->t_dqinfo); + tp->t_dqinfo = NULL; } - -xfs_dqtrxops_t xfs_trans_dquot_ops = { - .qo_dup_dqinfo = xfs_trans_dup_dqinfo, - .qo_free_dqinfo = xfs_trans_free_dqinfo, - .qo_mod_dquot_byino = xfs_trans_mod_dquot_byino, - .qo_apply_dquot_deltas = xfs_trans_apply_dquot_deltas, - .qo_reserve_quota_nblks = xfs_trans_reserve_quota_nblks, - .qo_reserve_quota_bydquots = xfs_trans_reserve_quota_bydquots, - .qo_unreserve_and_mod_dquots = xfs_trans_unreserve_and_mod_dquots, -}; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index b290270dd4a..47978ba89da 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -17,13 +17,13 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" -#include "xfs_dmapi.h" +#include "xfs_ag.h" #include "xfs_mount.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_extfree_item.h" @@ -48,9 +48,8 @@ xfs_trans_get_efi(xfs_trans_t *tp, /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efip); - - return (efip); + xfs_trans_add_item(tp, &efip->efi_item); + return efip; } /* @@ -64,22 +63,22 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, xfs_fsblock_t start_block, xfs_extlen_t ext_len) { - xfs_log_item_desc_t *lidp; uint next_extent; xfs_extent_t *extp; - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efip); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; - next_extent = efip->efi_next_extent; + /* + * atomic_inc_return gives us the value after the increment; + * we want to use it as an array index so we need to subtract 1 from + * it. + */ + next_extent = atomic_inc_return(&efip->efi_next_extent) - 1; ASSERT(next_extent < efip->efi_format.efi_nextents); extp = &(efip->efi_format.efi_extents[next_extent]); extp->ext_start = start_block; extp->ext_len = ext_len; - efip->efi_next_extent++; } @@ -105,9 +104,8 @@ xfs_trans_get_efd(xfs_trans_t *tp, /* * Get a log_item_desc to point at the new item. */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)efdp); - - return (efdp); + xfs_trans_add_item(tp, &efdp->efd_item); + return efdp; } /* @@ -121,15 +119,11 @@ xfs_trans_log_efd_extent(xfs_trans_t *tp, xfs_fsblock_t start_block, xfs_extlen_t ext_len) { - xfs_log_item_desc_t *lidp; uint next_extent; xfs_extent_t *extp; - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)efdp); - ASSERT(lidp != NULL); - tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index b8db1d5cde5..50c3f561428 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -17,226 +17,79 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" #include "xfs_inode.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_inode_item.h" - -#ifdef XFS_TRANS_DEBUG -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip); -#else -#define xfs_trans_inode_broot_debug(ip) -#endif - +#include "xfs_trace.h" /* - * Get and lock the inode for the caller if it is not already - * locked within the given transaction. If it is already locked - * within the transaction, just increment its lock recursion count - * and return a pointer to it. - * - * For an inode to be locked in a transaction, the inode lock, as - * opposed to the io lock, must be taken exclusively. This ensures - * that the inode can be involved in only 1 transaction at a time. - * Lock recursion is handled on the io lock, but only for lock modes - * of equal or lesser strength. That is, you can recur on the io lock - * held EXCL with a SHARED request but not vice versa. Also, if - * the inode is already a part of the transaction then you cannot - * go from not holding the io lock to having it EXCL or SHARED. + * Add a locked inode to the transaction. * - * Use the inode cache routine xfs_inode_incore() to find the inode - * if it is already owned by this transaction. - * - * If we don't already own the inode, use xfs_iget() to get it. - * Since the inode log item structure is embedded in the incore - * inode structure and is initialized when the inode is brought - * into memory, there is nothing to do with it here. - * - * If the given transaction pointer is NULL, just call xfs_iget(). - * This simplifies code which must handle both cases. - */ -int -xfs_trans_iget( - xfs_mount_t *mp, - xfs_trans_t *tp, - xfs_ino_t ino, - uint flags, - uint lock_flags, - xfs_inode_t **ipp) -{ - int error; - xfs_inode_t *ip; - xfs_inode_log_item_t *iip; - - /* - * If the transaction pointer is NULL, just call the normal - * xfs_iget(). - */ - if (tp == NULL) - return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0); - - /* - * If we find the inode in core with this transaction - * pointer in its i_transp field, then we know we already - * have it locked. In this case we just increment the lock - * recursion count and return the inode to the caller. - * Assert that the inode is already locked in the mode requested - * by the caller. We cannot do lock promotions yet, so - * die if someone gets this wrong. - */ - if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) { - /* - * Make sure that the inode lock is held EXCL and - * that the io lock is never upgraded when the inode - * is already a part of the transaction. - */ - ASSERT(ip->i_itemp != NULL); - ASSERT(lock_flags & XFS_ILOCK_EXCL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - ismrlocked(&ip->i_iolock, MR_UPDATE)); - ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); - ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS))); - ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); - - if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { - ip->i_itemp->ili_iolock_recur++; - } - if (lock_flags & XFS_ILOCK_EXCL) { - ip->i_itemp->ili_ilock_recur++; - } - *ipp = ip; - return 0; - } - - ASSERT(lock_flags & XFS_ILOCK_EXCL); - error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0); - if (error) { - return error; - } - ASSERT(ip != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - if (ip->i_itemp == NULL) - xfs_inode_item_init(ip, mp); - iip = ip->i_itemp; - (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip)); - - xfs_trans_inode_broot_debug(ip); - - /* - * If the IO lock has been acquired, mark that in - * the inode log item so we'll know to unlock it - * when the transaction commits. - */ - ASSERT(iip->ili_flags == 0); - if (lock_flags & XFS_IOLOCK_EXCL) { - iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; - } - - /* - * Initialize i_transp so we can find it with xfs_inode_incore() - * above. - */ - ip->i_transp = tp; - - *ipp = ip; - return 0; -} - -/* - * Add the locked inode to the transaction. - * The inode must be locked, and it cannot be associated with any - * transaction. The caller must specify the locks already held - * on the inode. + * The inode must be locked, and it cannot be associated with any transaction. + * If lock_flags is non-zero the inode will be unlocked on transaction commit. */ void xfs_trans_ijoin( - xfs_trans_t *tp, - xfs_inode_t *ip, - uint lock_flags) + struct xfs_trans *tp, + struct xfs_inode *ip, + uint lock_flags) { xfs_inode_log_item_t *iip; - ASSERT(ip->i_transp == NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(lock_flags & XFS_ILOCK_EXCL); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); iip = ip->i_itemp; - ASSERT(iip->ili_flags == 0); - ASSERT(iip->ili_ilock_recur == 0); - ASSERT(iip->ili_iolock_recur == 0); - - /* - * Get a log_item_desc to point at the new item. - */ - (void) xfs_trans_add_item(tp, (xfs_log_item_t*)(iip)); - xfs_trans_inode_broot_debug(ip); + ASSERT(iip->ili_lock_flags == 0); + iip->ili_lock_flags = lock_flags; /* - * If the IO lock is already held, mark that in the inode log item. - */ - if (lock_flags & XFS_IOLOCK_EXCL) { - iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL; - } else if (lock_flags & XFS_IOLOCK_SHARED) { - iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED; - } - - /* - * Initialize i_transp so we can find it with xfs_inode_incore() - * in xfs_trans_iget() above. + * Get a log_item_desc to point at the new item. */ - ip->i_transp = tp; + xfs_trans_add_item(tp, &iip->ili_item); } - - /* - * Mark the inode as not needing to be unlocked when the inode item's - * IOP_UNLOCK() routine is called. The inode must already be locked - * and associated with the given transaction. + * Transactional inode timestamp update. Requires the inode to be locked and + * joined to the transaction supplied. Relies on the transaction subsystem to + * track dirty state and update/writeback the inode accordingly. */ -/*ARGSUSED*/ void -xfs_trans_ihold( - xfs_trans_t *tp, - xfs_inode_t *ip) +xfs_trans_ichgtime( + struct xfs_trans *tp, + struct xfs_inode *ip, + int flags) { - ASSERT(ip->i_transp == tp); - ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + struct inode *inode = VFS_I(ip); + timespec_t tv; - ip->i_itemp->ili_flags |= XFS_ILI_HOLD; -} + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + tv = current_fs_time(inode->i_sb); + + if ((flags & XFS_ICHGTIME_MOD) && + !timespec_equal(&inode->i_mtime, &tv)) { + inode->i_mtime = tv; + ip->i_d.di_mtime.t_sec = tv.tv_sec; + ip->i_d.di_mtime.t_nsec = tv.tv_nsec; + } + if ((flags & XFS_ICHGTIME_CHG) && + !timespec_equal(&inode->i_ctime, &tv)) { + inode->i_ctime = tv; + ip->i_d.di_ctime.t_sec = tv.tv_sec; + ip->i_d.di_ctime.t_nsec = tv.tv_nsec; + } +} /* * This is called to mark the fields indicated in fieldmask as needing @@ -253,56 +106,32 @@ xfs_trans_log_inode( xfs_inode_t *ip, uint flags) { - xfs_log_item_desc_t *lidp; - - ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); - ASSERT(lidp != NULL); + /* + * First time we log the inode in a transaction, bump the inode change + * counter if it is configured for this to occur. We don't use + * inode_inc_version() because there is no need for extra locking around + * i_version as we already hold the inode locked exclusively for + * metadata modification. + */ + if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + IS_I_VERSION(VFS_I(ip))) { + ip->i_d.di_changecount = ++VFS_I(ip)->i_version; + flags |= XFS_ILOG_CORE; + } tp->t_flags |= XFS_TRANS_DIRTY; - lidp->lid_flags |= XFS_LID_DIRTY; + ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; /* * Always OR in the bits from the ili_last_fields field. * This is to coordinate with the xfs_iflush() and xfs_iflush_done() - * routines in the eventual clearing of the ilf_fields bits. + * routines in the eventual clearing of the ili_fields bits. * See the big comment in xfs_iflush() for an explanation of * this coordination mechanism. */ flags |= ip->i_itemp->ili_last_fields; - ip->i_itemp->ili_format.ilf_fields |= flags; -} - -#ifdef XFS_TRANS_DEBUG -/* - * Keep track of the state of the inode btree root to make sure we - * log it properly. - */ -STATIC void -xfs_trans_inode_broot_debug( - xfs_inode_t *ip) -{ - xfs_inode_log_item_t *iip; - - ASSERT(ip->i_itemp != NULL); - iip = ip->i_itemp; - if (iip->ili_root_size != 0) { - ASSERT(iip->ili_orig_root != NULL); - kmem_free(iip->ili_orig_root, iip->ili_root_size); - iip->ili_root_size = 0; - iip->ili_orig_root = NULL; - } - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { - ASSERT((ip->i_df.if_broot != NULL) && - (ip->i_df.if_broot_bytes > 0)); - iip->ili_root_size = ip->i_df.if_broot_bytes; - iip->ili_orig_root = - (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); - memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), - iip->ili_root_size); - } + ip->i_itemp->ili_fields |= flags; } -#endif diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c deleted file mode 100644 index 2912aac07c7..00000000000 --- a/fs/xfs/xfs_trans_item.c +++ /dev/null @@ -1,538 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" - -STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *, - int, int, xfs_lsn_t); - -/* - * This is called to add the given log item to the transaction's - * list of log items. It must find a free log item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to item descriptor used to point - * to the new item. The log item will now point to its new descriptor - * with its li_desc field. - */ -xfs_log_item_desc_t * -xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_chunk_t *licp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_items_free == 0) { - licp = (xfs_log_item_chunk_t*) - kmem_alloc(sizeof(xfs_log_item_chunk_t), KM_SLEEP); - ASSERT(licp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LIC_INIT(licp); - XFS_LIC_CLAIM(licp, 0); - licp->lic_unused = 1; - XFS_LIC_INIT_SLOT(licp, 0); - lidp = XFS_LIC_SLOT(licp, 0); - - /* - * Link in the new chunk and update the free count. - */ - licp->lic_next = tp->t_items.lic_next; - tp->t_items.lic_next = licp; - tp->t_items_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lidp->lid_item = lip; - lidp->lid_flags = 0; - lidp->lid_size = 0; - lip->li_desc = lidp; - lip->li_mountp = tp->t_mountp; - return lidp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - licp = &tp->t_items; - while (licp != NULL) { - if (XFS_LIC_VACANCY(licp)) { - if (licp->lic_unused <= XFS_LIC_MAX_SLOT) { - i = licp->lic_unused; - ASSERT(XFS_LIC_ISFREE(licp, i)); - break; - } - for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) { - if (XFS_LIC_ISFREE(licp, i)) - break; - } - ASSERT(i <= XFS_LIC_MAX_SLOT); - break; - } - licp = licp->lic_next; - } - ASSERT(licp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LIC_CLAIM(licp, i); - if (licp->lic_unused <= i) { - licp->lic_unused = i + 1; - XFS_LIC_INIT_SLOT(licp, i); - } - lidp = XFS_LIC_SLOT(licp, i); - tp->t_items_free--; - lidp->lid_item = lip; - lidp->lid_flags = 0; - lidp->lid_size = 0; - lip->li_desc = lidp; - lip->li_mountp = tp->t_mountp; - return lidp; -} - -/* - * Free the given descriptor. - * - * This requires setting the bit in the chunk's free mask corresponding - * to the given slot. - */ -void -xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) -{ - uint slot; - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t **licpp; - - slot = XFS_LIC_DESC_TO_SLOT(lidp); - licp = XFS_LIC_DESC_TO_CHUNK(lidp); - XFS_LIC_RELSE(licp, slot); - lidp->lid_item->li_desc = NULL; - tp->t_items_free++; - - /* - * If there are no more used items in the chunk and this is not - * the chunk embedded in the transaction structure, then free - * the chunk. First pull it from the chunk list and then - * free it back to the heap. We didn't bother with a doubly - * linked list here because the lists should be very short - * and this is not a performance path. It's better to save - * the memory of the extra pointer. - * - * Also decrement the transaction structure's count of free items - * by the number in a chunk since we are freeing an empty chunk. - */ - if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) { - licpp = &(tp->t_items.lic_next); - while (*licpp != licp) { - ASSERT(*licpp != NULL); - licpp = &((*licpp)->lic_next); - } - *licpp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); - tp->t_items_free -= XFS_LIC_NUM_SLOTS; - } -} - -/* - * This is called to find the descriptor corresponding to the given - * log item. It returns a pointer to the descriptor. - * The log item MUST have a corresponding descriptor in the given - * transaction. This routine does not return NULL, it panics. - * - * The descriptor pointer is kept in the log item's li_desc field. - * Just return it. - */ -/*ARGSUSED*/ -xfs_log_item_desc_t * -xfs_trans_find_item(xfs_trans_t *tp, xfs_log_item_t *lip) -{ - ASSERT(lip->li_desc != NULL); - - return lip->li_desc; -} - - -/* - * Return a pointer to the first descriptor in the chunk list. - * This does not return NULL if there are none, it panics. - * - * The first descriptor must be in either the first or second chunk. - * This is because the only chunk allowed to be empty is the first. - * All others are freed when they become empty. - * - * At some point this and xfs_trans_next_item() should be optimized - * to quickly look at the mask to determine if there is anything to - * look at. - */ -xfs_log_item_desc_t * -xfs_trans_first_item(xfs_trans_t *tp) -{ - xfs_log_item_chunk_t *licp; - int i; - - licp = &tp->t_items; - /* - * If it's not in the first chunk, skip to the second. - */ - if (XFS_LIC_ARE_ALL_FREE(licp)) { - licp = licp->lic_next; - } - - /* - * Return the first non-free descriptor in the chunk. - */ - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); - for (i = 0; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - - return XFS_LIC_SLOT(licp, i); - } - cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item"); - return NULL; -} - - -/* - * Given a descriptor, return the next descriptor in the chunk list. - * This returns NULL if there are no more used descriptors in the list. - * - * We do this by first locating the chunk in which the descriptor resides, - * and then scanning forward in the chunk and the list for the next - * used descriptor. - */ -/*ARGSUSED*/ -xfs_log_item_desc_t * -xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp) -{ - xfs_log_item_chunk_t *licp; - int i; - - licp = XFS_LIC_DESC_TO_CHUNK(lidp); - - /* - * First search the rest of the chunk. The for loop keeps us - * from referencing things beyond the end of the chunk. - */ - for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - - return XFS_LIC_SLOT(licp, i); - } - - /* - * Now search the next chunk. It must be there, because the - * next chunk would have been freed if it were empty. - * If there is no next chunk, return NULL. - */ - if (licp->lic_next == NULL) { - return NULL; - } - - licp = licp->lic_next; - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); - for (i = 0; i < licp->lic_unused; i++) { - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - - return XFS_LIC_SLOT(licp, i); - } - ASSERT(0); - /* NOTREACHED */ - return NULL; /* keep gcc quite */ -} - -/* - * This is called to unlock all of the items of a transaction and to free - * all the descriptors of that transaction. - * - * It walks the list of descriptors and unlocks each item. It frees - * each chunk except that embedded in the transaction as it goes along. - */ -void -xfs_trans_free_items( - xfs_trans_t *tp, - int flags) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - int abort; - - abort = flags & XFS_TRANS_ABORT; - licp = &tp->t_items; - /* - * Special case the embedded chunk so we don't free it below. - */ - if (!XFS_LIC_ARE_ALL_FREE(licp)) { - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); - XFS_LIC_ALL_FREE(licp); - licp->lic_unused = 0; - } - licp = licp->lic_next; - - /* - * Unlock each item in each chunk and free the chunks. - */ - while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); - (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); - next_licp = licp->lic_next; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); - licp = next_licp; - } - - /* - * Reset the transaction structure's free item count. - */ - tp->t_items_free = XFS_LIC_NUM_SLOTS; - tp->t_items.lic_next = NULL; -} - - - -/* - * This is called to unlock the items associated with a transaction. - * Items which were not logged should be freed. - * Those which were logged must still be tracked so they can be unpinned - * when the transaction commits. - */ -void -xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn) -{ - xfs_log_item_chunk_t *licp; - xfs_log_item_chunk_t *next_licp; - xfs_log_item_chunk_t **licpp; - int freed; - - freed = 0; - licp = &tp->t_items; - - /* - * Special case the embedded chunk so we don't free. - */ - if (!XFS_LIC_ARE_ALL_FREE(licp)) { - freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); - } - licpp = &(tp->t_items.lic_next); - licp = licp->lic_next; - - /* - * Unlock each item in each chunk, free non-dirty descriptors, - * and free empty chunks. - */ - while (licp != NULL) { - ASSERT(!XFS_LIC_ARE_ALL_FREE(licp)); - freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn); - next_licp = licp->lic_next; - if (XFS_LIC_ARE_ALL_FREE(licp)) { - *licpp = next_licp; - kmem_free(licp, sizeof(xfs_log_item_chunk_t)); - freed -= XFS_LIC_NUM_SLOTS; - } else { - licpp = &(licp->lic_next); - } - ASSERT(*licpp == next_licp); - licp = next_licp; - } - - /* - * Fix the free descriptor count in the transaction. - */ - tp->t_items_free += freed; -} - -/* - * Unlock each item pointed to by a descriptor in the given chunk. - * Stamp the commit lsn into each item if necessary. - * Free descriptors pointing to items which are not dirty if freeing_chunk - * is zero. If freeing_chunk is non-zero, then we need to unlock all - * items in the chunk. - * - * Return the number of descriptors freed. - */ -STATIC int -xfs_trans_unlock_chunk( - xfs_log_item_chunk_t *licp, - int freeing_chunk, - int abort, - xfs_lsn_t commit_lsn) -{ - xfs_log_item_desc_t *lidp; - xfs_log_item_t *lip; - int i; - int freed; - - freed = 0; - lidp = licp->lic_descs; - for (i = 0; i < licp->lic_unused; i++, lidp++) { - if (XFS_LIC_ISFREE(licp, i)) { - continue; - } - lip = lidp->lid_item; - lip->li_desc = NULL; - - if (commit_lsn != NULLCOMMITLSN) - IOP_COMMITTING(lip, commit_lsn); - if (abort) - lip->li_flags |= XFS_LI_ABORTED; - IOP_UNLOCK(lip); - - /* - * Free the descriptor if the item is not dirty - * within this transaction and the caller is not - * going to just free the entire thing regardless. - */ - if (!(freeing_chunk) && - (!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) { - XFS_LIC_RELSE(licp, i); - freed++; - } - } - - return freed; -} - - -/* - * This is called to add the given busy item to the transaction's - * list of busy items. It must find a free busy item descriptor - * or allocate a new one and add the item to that descriptor. - * The function returns a pointer to busy descriptor used to point - * to the new busy entry. The log busy entry will now point to its new - * descriptor with its ???? field. - */ -xfs_log_busy_slot_t * -xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_slot_t *lbsp; - int i=0; - - /* - * If there are no free descriptors, allocate a new chunk - * of them and put it at the front of the chunk list. - */ - if (tp->t_busy_free == 0) { - lbcp = (xfs_log_busy_chunk_t*) - kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP); - ASSERT(lbcp != NULL); - /* - * Initialize the chunk, and then - * claim the first slot in the newly allocated chunk. - */ - XFS_LBC_INIT(lbcp); - XFS_LBC_CLAIM(lbcp, 0); - lbcp->lbc_unused = 1; - lbsp = XFS_LBC_SLOT(lbcp, 0); - - /* - * Link in the new chunk and update the free count. - */ - lbcp->lbc_next = tp->t_busy.lbc_next; - tp->t_busy.lbc_next = lbcp; - tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1; - - /* - * Initialize the descriptor and the generic portion - * of the log item. - * - * Point the new slot at this item and return it. - * Also point the log item at its currently active - * descriptor and set the item's mount pointer. - */ - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; - } - - /* - * Find the free descriptor. It is somewhere in the chunklist - * of descriptors. - */ - lbcp = &tp->t_busy; - while (lbcp != NULL) { - if (XFS_LBC_VACANCY(lbcp)) { - if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) { - i = lbcp->lbc_unused; - break; - } else { - /* out-of-order vacancy */ - cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp); - ASSERT(0); - } - } - lbcp = lbcp->lbc_next; - } - ASSERT(lbcp != NULL); - /* - * If we find a free descriptor, claim it, - * initialize it, and return it. - */ - XFS_LBC_CLAIM(lbcp, i); - if (lbcp->lbc_unused <= i) { - lbcp->lbc_unused = i + 1; - } - lbsp = XFS_LBC_SLOT(lbcp, i); - tp->t_busy_free--; - lbsp->lbc_ag = ag; - lbsp->lbc_idx = idx; - return lbsp; -} - - -/* - * xfs_trans_free_busy - * Free all of the busy lists from a transaction - */ -void -xfs_trans_free_busy(xfs_trans_t *tp) -{ - xfs_log_busy_chunk_t *lbcp; - xfs_log_busy_chunk_t *lbcq; - - lbcp = tp->t_busy.lbc_next; - while (lbcp != NULL) { - lbcq = lbcp->lbc_next; - kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t)); - lbcp = lbcq; - } - - XFS_LBC_INIT(&tp->t_busy); - tp->t_busy.lbc_unused = 0; -} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 13edab8a9e9..bd1281862ad 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -22,38 +22,140 @@ struct xfs_log_item; struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; +struct xfs_ail; +struct xfs_log_vec; + + +void xfs_trans_init(struct xfs_mount *); +void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); +void xfs_trans_del_item(struct xfs_log_item *); +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); +void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); + +void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, + xfs_lsn_t commit_lsn, int aborted); +/* + * AIL traversal cursor. + * + * Rather than using a generation number for detecting changes in the ail, use + * a cursor that is protected by the ail lock. The aild cursor exists in the + * struct xfs_ail, but other traversals can declare it on the stack and link it + * to the ail list. + * + * When an object is deleted from or moved int the AIL, the cursor list is + * searched to see if the object is a designated cursor item. If it is, it is + * deleted from the cursor so that the next time the cursor is used traversal + * will return to the start. + * + * This means a traversal colliding with a removal will cause a restart of the + * list scan, rather than any insertion or deletion anywhere in the list. The + * low bit of the item pointer is set if the cursor has been invalidated so + * that we can tell the difference between invalidation and reaching the end + * of the list to trigger traversal restarts. + */ +struct xfs_ail_cursor { + struct list_head list; + struct xfs_log_item *item; +}; /* - * From xfs_trans_item.c + * Private AIL structures. + * + * Eventually we need to drive the locking in here as well. */ -struct xfs_log_item_desc *xfs_trans_add_item(struct xfs_trans *, - struct xfs_log_item *); -void xfs_trans_free_item(struct xfs_trans *, - struct xfs_log_item_desc *); -struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *, - struct xfs_log_item *); -struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); -struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, - struct xfs_log_item_desc *); -void xfs_trans_free_items(struct xfs_trans *, int); -void xfs_trans_unlock_items(struct xfs_trans *, - xfs_lsn_t); -void xfs_trans_free_busy(xfs_trans_t *tp); -xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, - xfs_agnumber_t ag, - xfs_extlen_t idx); +struct xfs_ail { + struct xfs_mount *xa_mount; + struct task_struct *xa_task; + struct list_head xa_ail; + xfs_lsn_t xa_target; + xfs_lsn_t xa_target_prev; + struct list_head xa_cursors; + spinlock_t xa_lock; + xfs_lsn_t xa_last_pushed_lsn; + int xa_log_flush; + struct list_head xa_buf_list; + wait_queue_head_t xa_empty; +}; /* * From xfs_trans_ail.c */ -void xfs_trans_update_ail(struct xfs_mount *, - struct xfs_log_item *, xfs_lsn_t, - unsigned long); -void xfs_trans_delete_ail(struct xfs_mount *, - struct xfs_log_item *, unsigned long); -struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *); -struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *, - struct xfs_log_item *, int *, int *); +void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + struct xfs_log_item **log_items, int nr_items, + xfs_lsn_t lsn) __releases(ailp->xa_lock); +/* + * Return a pointer to the first item in the AIL. If the AIL is empty, then + * return NULL. + */ +static inline struct xfs_log_item * +xfs_ail_min( + struct xfs_ail *ailp) +{ + return list_first_entry_or_null(&ailp->xa_ail, struct xfs_log_item, + li_ail); +} + +static inline void +xfs_trans_ail_update( + struct xfs_ail *ailp, + struct xfs_log_item *lip, + xfs_lsn_t lsn) __releases(ailp->xa_lock) +{ + xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); +} + +void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, + struct xfs_log_item **log_items, int nr_items, + int shutdown_type) + __releases(ailp->xa_lock); +static inline void +xfs_trans_ail_delete( + struct xfs_ail *ailp, + xfs_log_item_t *lip, + int shutdown_type) __releases(ailp->xa_lock) +{ + xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); +} + +void xfs_ail_push(struct xfs_ail *, xfs_lsn_t); +void xfs_ail_push_all(struct xfs_ail *); +void xfs_ail_push_all_sync(struct xfs_ail *); +struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); +struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur, + xfs_lsn_t lsn); +struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp, + struct xfs_ail_cursor *cur); +void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur); +#if BITS_PER_LONG != 64 +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */ + spin_lock(&ailp->xa_lock); + *dst = *src; + spin_unlock(&ailp->xa_lock); +} +#else +static inline void +xfs_trans_ail_copy_lsn( + struct xfs_ail *ailp, + xfs_lsn_t *dst, + xfs_lsn_t *src) +{ + ASSERT(sizeof(xfs_lsn_t) == 8); + *dst = *src; +} +#endif #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c new file mode 100644 index 00000000000..f2bda7c76b8 --- /dev/null +++ b/fs/xfs/xfs_trans_resv.c @@ -0,0 +1,894 @@ +/* + * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (C) 2010 Red Hat, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_da_btree.h" +#include "xfs_inode.h" +#include "xfs_bmap_btree.h" +#include "xfs_ialloc.h" +#include "xfs_quota.h" +#include "xfs_trans.h" +#include "xfs_qm.h" +#include "xfs_trans_space.h" +#include "xfs_trace.h" + +/* + * A buffer has a format structure overhead in the log in addition + * to the data, so we need to take this into account when reserving + * space in a transaction for a buffer. Round the space required up + * to a multiple of 128 bytes so that we don't change the historical + * reservation that has been used for this overhead. + */ +STATIC uint +xfs_buf_log_overhead(void) +{ + return round_up(sizeof(struct xlog_op_header) + + sizeof(struct xfs_buf_log_format), 128); +} + +/* + * Calculate out transaction log reservation per item in bytes. + * + * The nbufs argument is used to indicate the number of items that + * will be changed in a transaction. size is used to tell how many + * bytes should be reserved per item. + */ +STATIC uint +xfs_calc_buf_res( + uint nbufs, + uint size) +{ + return nbufs * (size + xfs_buf_log_overhead()); +} + +/* + * Logging inodes is really tricksy. They are logged in memory format, + * which means that what we write into the log doesn't directly translate into + * the amount of space they use on disk. + * + * Case in point - btree format forks in memory format use more space than the + * on-disk format. In memory, the buffer contains a normal btree block header so + * the btree code can treat it as though it is just another generic buffer. + * However, when we write it to the inode fork, we don't write all of this + * header as it isn't needed. e.g. the root is only ever in the inode, so + * there's no need for sibling pointers which would waste 16 bytes of space. + * + * Hence when we have an inode with a maximally sized btree format fork, then + * amount of information we actually log is greater than the size of the inode + * on disk. Hence we need an inode reservation function that calculates all this + * correctly. So, we log: + * + * - 4 log op headers for object + * - for the ilf, the inode core and 2 forks + * - inode log format object + * - the inode core + * - two inode forks containing bmap btree root blocks. + * - the btree data contained by both forks will fit into the inode size, + * hence when combined with the inode core above, we have a total of the + * actual inode size. + * - the BMBT headers need to be accounted separately, as they are + * additional to the records and pointers that fit inside the inode + * forks. + */ +STATIC uint +xfs_calc_inode_res( + struct xfs_mount *mp, + uint ninodes) +{ + return ninodes * + (4 * sizeof(struct xlog_op_header) + + sizeof(struct xfs_inode_log_format) + + mp->m_sb.sb_inodesize + + 2 * XFS_BMBT_BLOCK_LEN(mp)); +} + +/* + * The free inode btree is a conditional feature and the log reservation + * requirements differ slightly from that of the traditional inode allocation + * btree. The finobt tracks records for inode chunks with at least one free + * inode. A record can be removed from the tree for an inode allocation + * or free and thus the finobt reservation is unconditional across: + * + * - inode allocation + * - inode free + * - inode chunk allocation + * + * The 'modify' param indicates to include the record modification scenario. The + * 'alloc' param indicates to include the reservation for free space btree + * modifications on behalf of finobt modifications. This is required only for + * transactions that do not already account for free space btree modifications. + * + * the free inode btree: max depth * block size + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the free inode btree entry: block size + */ +STATIC uint +xfs_calc_finobt_res( + struct xfs_mount *mp, + int alloc, + int modify) +{ + uint res; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)); + if (alloc) + res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); + if (modify) + res += (uint)XFS_FSB_TO_B(mp, 1); + + return res; +} + +/* + * Various log reservation values. + * + * These are based on the size of the file system block because that is what + * most transactions manipulate. Each adds in an additional 128 bytes per + * item logged to try to account for the overhead of the transaction mechanism. + * + * Note: Most of the reservations underestimate the number of allocation + * groups into which they could free extents in the xfs_bmap_finish() call. + * This is because the number in the worst case is quite high and quite + * unusual. In order to fix this we need to change xfs_bmap_finish() to free + * extents in only a single AG at a time. This will require changes to the + * EFI code as well, however, so that the EFI for the extents not freed is + * logged again in each transaction. See SGI PV #261917. + * + * Reservation functions here avoid a huge stack in xfs_trans_init due to + * register overflow from temporaries in the calculations. + */ + + +/* + * In a write transaction we can allocate a maximum of 2 + * extents. This gives: + * the inode getting the new extents: inode size + * the inode's bmap btree: max depth * block size + * the agfs of the ags from which the extents are allocated: 2 * sector + * the superblock free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + * And the bmap_finish transaction can free bmap blocks in a join: + * the agfs of the ags containing the blocks: 2 * sector size + * the agfls of the ags containing the blocks: 2 * sector size + * the super block free block counter: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_write_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * In truncating a file we free up to two extents at once. We can modify: + * the inode being truncated: inode size + * the inode's bmap btree: (max depth + 1) * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_itruncate_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(5, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0))); +} + +/* + * In renaming a files we can modify: + * the four inodes involved: 4 * inode size + * the two directory btrees: 2 * (max depth + v2) * dir block size + * the two directory bmap btrees: 2 * max depth * block size + * And the bmap_finish transaction can free dir and bmap blocks (two sets + * of bmap blocks) giving: + * the agf for the ags in which the blocks live: 3 * sector size + * the agfl for the ags in which the blocks live: 3 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_rename_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 4) + + xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For removing an inode from unlinked list at first, we can modify: + * the agi hash list and counters: sector size + * the on disk inode before ours in the agi hash list: inode cluster size + */ +STATIC uint +xfs_calc_iunlink_remove_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size); +} + +/* + * For creating a link to an inode: + * the parent directory inode: inode size + * the linked inode: inode size + * the directory btree could split: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free some bmap blocks giving: + * the agf for the ag in which the blocks live: sector size + * the agfl for the ag in which the blocks live: sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_link_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_remove_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For adding an inode to unlinked list we can modify: + * the agi hash list: sector size + * the unlinked inode: inode size + */ +STATIC uint +xfs_calc_iunlink_add_reservation(xfs_mount_t *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 1); +} + +/* + * For removing a directory entry we can modify: + * the parent directory inode: inode size + * the removed inode: inode size + * the directory btree could join: (max depth + v2) * dir block size + * the directory bmap btree could join or split: (max depth + v2) * blocksize + * And the bmap_finish transaction can free the dir and bmap blocks giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_remove_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_iunlink_add_reservation(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * For create, break it in to the two cases that the transaction + * covers. We start with the modify case - allocation done by modification + * of the state of existing inodes - and the allocation case. + */ + +/* + * For create we can modify: + * the parent directory inode: inode size + * the new inode: inode size + * the inode btree entry: block size + * the superblock for the nlink flag: sector size + * the directory btree: (max depth + v2) * dir block size + * the directory inode's bmap btree: (max depth + v2) * block size + * the finobt (record modification and allocation btrees) + */ +STATIC uint +xfs_calc_create_resv_modify( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + (uint)XFS_FSB_TO_B(mp, 1) + + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 1, 1); +} + +/* + * For create we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode blocks allocated: mp->m_ialloc_blks * blocksize + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + */ +STATIC uint +xfs_calc_create_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +STATIC uint +__xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_create_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +/* + * For icreate we can allocate some inodes giving: + * the agi and agf of the ag getting the new inodes: 2 * sectorsize + * the superblock for the nlink flag: sector size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion) + */ +STATIC uint +xfs_calc_icreate_resv_alloc( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + mp->m_sb.sb_sectsize + + xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 0); +} + +STATIC uint +xfs_calc_icreate_reservation(xfs_mount_t *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX(xfs_calc_icreate_resv_alloc(mp), + xfs_calc_create_resv_modify(mp)); +} + +STATIC uint +xfs_calc_create_reservation( + struct xfs_mount *mp) +{ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return xfs_calc_icreate_reservation(mp); + return __xfs_calc_create_reservation(mp); + +} + +STATIC uint +xfs_calc_create_tmpfile_reservation( + struct xfs_mount *mp) +{ + uint res = XFS_DQUOT_LOGRES(mp); + + if (xfs_sb_version_hascrc(&mp->m_sb)) + res += xfs_calc_icreate_resv_alloc(mp); + else + res += xfs_calc_create_resv_alloc(mp); + + return res + xfs_calc_iunlink_add_reservation(mp); +} + +/* + * Making a new directory is the same as creating a new file. + */ +STATIC uint +xfs_calc_mkdir_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp); +} + + +/* + * Making a new symplink is the same as creating a new file, but + * with the added blocks for remote symlink data which can be up to 1kB in + * length (MAXPATHLEN). + */ +STATIC uint +xfs_calc_symlink_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_create_reservation(mp) + + xfs_calc_buf_res(1, MAXPATHLEN); +} + +/* + * In freeing an inode we can modify: + * the inode being freed: inode size + * the super block free inode counter: sector size + * the agi hash list and counters: sector size + * the inode btree entry: block size + * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size + * the finobt (record insertion, removal or modification) + */ +STATIC uint +xfs_calc_ifree_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + + xfs_calc_iunlink_remove_reservation(mp) + + xfs_calc_buf_res(1, 0) + + xfs_calc_buf_res(2 + mp->m_ialloc_blks + + mp->m_in_maxlevels, 0) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_finobt_res(mp, 0, 1); +} + +/* + * When only changing the inode we log the inode and possibly the superblock + * We also add a bit of slop for the transaction stuff. + */ +STATIC uint +xfs_calc_ichange_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); + +} + +/* + * Growing the data section of the filesystem. + * superblock + * agi and agf + * allocation btrees + */ +STATIC uint +xfs_calc_growdata_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the first set of transactions (ALLOC) we allocate space to the + * bitmap or summary files. + * superblock: sector size + * agf of the ag from which the extent is allocated: sector size + * bmap btree for bitmap/summary inode: max depth * blocksize + * bitmap/summary inode: inode size + * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize + */ +STATIC uint +xfs_calc_growrtalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Growing the rt section of the filesystem. + * In the second set of transactions (ZERO) we zero the new metadata blocks. + * one bitmap/summary block: blocksize + */ +STATIC uint +xfs_calc_growrtzero_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); +} + +/* + * Growing the rt section of the filesystem. + * In the third set of transactions (FREE) we update metadata without + * allocating any new blocks. + * superblock: sector size + * bitmap inode: inode size + * summary inode: inode size + * one bitmap block: blocksize + * summary blocks: new summary size + */ +STATIC uint +xfs_calc_growrtfree_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_inode_res(mp, 2) + + xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(1, mp->m_rsumsize); +} + +/* + * Logging the inode modification timestamp on a synchronous write. + * inode + */ +STATIC uint +xfs_calc_swrite_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Logging the inode mode bits when writing a setuid/setgid file + * inode + */ +STATIC uint +xfs_calc_writeid_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_inode_res(mp, 1); +} + +/* + * Converting the inode from non-attributed to attributed. + * the inode being converted: inode size + * agf block and superblock (for block allocation) + * the new block (directory sized) + * bmap blocks for the new directory block + * allocation btrees + */ +STATIC uint +xfs_calc_addafork_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(1, mp->m_dir_geo->blksize) + + xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, + XFS_FSB_TO_B(mp, 1)) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing the attribute fork of a file + * the inode being truncated: inode size + * the inode's bmap btree: max depth * block size + * And the bmap_finish transaction can free the blocks and bmap blocks: + * the agf for each of the ags: 4 * sector size + * the agfl for each of the ags: 4 * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming 4 extents: + * 4 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrinval_reservation( + struct xfs_mount *mp) +{ + return MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1))), + (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Setting an attribute at mount time. + * the inode getting the attribute + * the superblock for allocations + * the agfs extents are allocated from + * the attribute btree * max depth + * the inode allocation btree + * Since attribute transaction space is dependent on the size of the attribute, + * the calculation is done partially at mount time and partially at runtime(see + * below). + */ +STATIC uint +xfs_calc_attrsetm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); +} + +/* + * Setting an attribute at runtime, transaction space unit per block. + * the superblock for allocations: sector size + * the inode bmap btree could join or split: max depth * block size + * Since the runtime attribute transaction space is dependent on the total + * blocks needed for the 1st bmap, here we calculate out the space unit for + * one block so that the caller could figure out the total space according + * to the attibute extent length in blocks by: + * ext * M_RES(mp)->tr_attrsetrt.tr_logres + */ +STATIC uint +xfs_calc_attrsetrt_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), + XFS_FSB_TO_B(mp, 1)); +} + +/* + * Removing an attribute. + * the inode: inode size + * the attribute btree could join: max depth * block size + * the inode bmap btree could join or split: max depth * block size + * And the bmap_finish transaction can free the attr blocks freed giving: + * the agf for the ag in which the blocks live: 2 * sector size + * the agfl for the ag in which the blocks live: 2 * sector size + * the superblock for the free block count: sector size + * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +STATIC uint +xfs_calc_attrrm_reservation( + struct xfs_mount *mp) +{ + return XFS_DQUOT_LOGRES(mp) + + MAX((xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, + XFS_FSB_TO_B(mp, 1)) + + (uint)XFS_FSB_TO_B(mp, + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), + (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), + XFS_FSB_TO_B(mp, 1)))); +} + +/* + * Clearing a bad agino number in an agi hash bucket. + */ +STATIC uint +xfs_calc_clear_agi_bucket_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Clearing the quotaflags in the superblock. + * the super block for changing quota flags: sector size + */ +STATIC uint +xfs_calc_qm_sbchange_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * Adjusting quota limits. + * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) + */ +STATIC uint +xfs_calc_qm_setqlim_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); +} + +/* + * Allocating quota on disk if needed. + * the write transaction log space for quota file extent allocation + * the unit of quota allocation: one system block size + */ +STATIC uint +xfs_calc_qm_dqalloc_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_write_reservation(mp) + + xfs_calc_buf_res(1, + XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); +} + +/* + * Turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + * the superblock for the quota flags: sector size + */ +STATIC uint +xfs_calc_qm_quotaoff_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2 + + xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +/* + * End of turning off quotas. + * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 + */ +STATIC uint +xfs_calc_qm_quotaoff_end_reservation( + struct xfs_mount *mp) +{ + return sizeof(struct xfs_qoff_logitem) * 2; +} + +/* + * Syncing the incore super block changes to disk. + * the super block to reflect the changes: sector size + */ +STATIC uint +xfs_calc_sb_reservation( + struct xfs_mount *mp) +{ + return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); +} + +void +xfs_trans_resv_calc( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* + * The following transactions are logged in physical format and + * require a permanent reservation on space. + */ + resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); + resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); + resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT; + resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp); + resp->tr_rename.tr_logcount = XFS_RENAME_LOG_COUNT; + resp->tr_rename.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_link.tr_logres = xfs_calc_link_reservation(mp); + resp->tr_link.tr_logcount = XFS_LINK_LOG_COUNT; + resp->tr_link.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_remove.tr_logres = xfs_calc_remove_reservation(mp); + resp->tr_remove.tr_logcount = XFS_REMOVE_LOG_COUNT; + resp->tr_remove.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_symlink.tr_logres = xfs_calc_symlink_reservation(mp); + resp->tr_symlink.tr_logcount = XFS_SYMLINK_LOG_COUNT; + resp->tr_symlink.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create.tr_logres = xfs_calc_create_reservation(mp); + resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT; + resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_create_tmpfile.tr_logres = + xfs_calc_create_tmpfile_reservation(mp); + resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT; + resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp); + resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT; + resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_ifree.tr_logres = xfs_calc_ifree_reservation(mp); + resp->tr_ifree.tr_logcount = XFS_INACTIVE_LOG_COUNT; + resp->tr_ifree.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_addafork.tr_logres = xfs_calc_addafork_reservation(mp); + resp->tr_addafork.tr_logcount = XFS_ADDAFORK_LOG_COUNT; + resp->tr_addafork.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrinval.tr_logres = xfs_calc_attrinval_reservation(mp); + resp->tr_attrinval.tr_logcount = XFS_ATTRINVAL_LOG_COUNT; + resp->tr_attrinval.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrsetm.tr_logres = xfs_calc_attrsetm_reservation(mp); + resp->tr_attrsetm.tr_logcount = XFS_ATTRSET_LOG_COUNT; + resp->tr_attrsetm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_attrrm.tr_logres = xfs_calc_attrrm_reservation(mp); + resp->tr_attrrm.tr_logcount = XFS_ATTRRM_LOG_COUNT; + resp->tr_attrrm.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_growrtalloc.tr_logres = xfs_calc_growrtalloc_reservation(mp); + resp->tr_growrtalloc.tr_logcount = XFS_DEFAULT_PERM_LOG_COUNT; + resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); + resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; + resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; + + /* + * The following transactions are logged in logical format with + * a default log count. + */ + resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp); + resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp); + resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); + resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_qm_equotaoff.tr_logres = + xfs_calc_qm_quotaoff_end_reservation(mp); + resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); + resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; + + /* The following transaction are logged in logical format */ + resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp); + resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp); + resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp); + resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp); + resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp); + resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); + resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); + resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); +} diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h new file mode 100644 index 00000000000..1097d14cd58 --- /dev/null +++ b/fs/xfs/xfs_trans_resv.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef __XFS_TRANS_RESV_H__ +#define __XFS_TRANS_RESV_H__ + +struct xfs_mount; + +/* + * structure for maintaining pre-calculated transaction reservations. + */ +struct xfs_trans_res { + uint tr_logres; /* log space unit in bytes per log ticket */ + int tr_logcount; /* number of log operations per log ticket */ + int tr_logflags; /* log flags, currently only used for indicating + * a reservation request is permanent or not */ +}; + +struct xfs_trans_resv { + struct xfs_trans_res tr_write; /* extent alloc trans */ + struct xfs_trans_res tr_itruncate; /* truncate trans */ + struct xfs_trans_res tr_rename; /* rename trans */ + struct xfs_trans_res tr_link; /* link trans */ + struct xfs_trans_res tr_remove; /* unlink trans */ + struct xfs_trans_res tr_symlink; /* symlink trans */ + struct xfs_trans_res tr_create; /* create trans */ + struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */ + struct xfs_trans_res tr_mkdir; /* mkdir trans */ + struct xfs_trans_res tr_ifree; /* inode free trans */ + struct xfs_trans_res tr_ichange; /* inode update trans */ + struct xfs_trans_res tr_growdata; /* fs data section grow trans */ + struct xfs_trans_res tr_addafork; /* add inode attr fork trans */ + struct xfs_trans_res tr_writeid; /* write setuid/setgid file */ + struct xfs_trans_res tr_attrinval; /* attr fork buffer + * invalidation */ + struct xfs_trans_res tr_attrsetm; /* set/create an attribute at + * mount time */ + struct xfs_trans_res tr_attrsetrt; /* set/create an attribute at + * runtime */ + struct xfs_trans_res tr_attrrm; /* remove an attribute */ + struct xfs_trans_res tr_clearagi; /* clear agi unlinked bucket */ + struct xfs_trans_res tr_growrtalloc; /* grow realtime allocations */ + struct xfs_trans_res tr_growrtzero; /* grow realtime zeroing */ + struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ + struct xfs_trans_res tr_qm_sbchange; /* change quota flags */ + struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ + struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ + struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ + struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ + struct xfs_trans_res tr_sb; /* modify superblock */ + struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ +}; + +/* shorthand way of accessing reservation structure */ +#define M_RES(mp) (&(mp)->m_resv) + +/* + * Per-extent log reservation for the allocation btree changes + * involved in freeing or allocating an extent. + * 2 trees * (2 blocks/level * max depth - 1) * block size + */ +#define XFS_ALLOCFREE_LOG_RES(mp,nx) \ + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) +#define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ + ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + +/* + * Per-directory log reservation for any directory change. + * dir blocks: (1 btree block per level + data block + free block) * dblock size + * bmap btree: (levels + 2) * max depth * block size + * v2 directory blocks can be fragmented below the dirblksize down to the fsb + * size, so account for that in the DAENTER macros. + */ +#define XFS_DIROP_LOG_RES(mp) \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK)) + \ + (XFS_FSB_TO_B(mp, XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1))) +#define XFS_DIROP_LOG_COUNT(mp) \ + (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ + XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) + +/* + * Various log count values. + */ +#define XFS_DEFAULT_LOG_COUNT 1 +#define XFS_DEFAULT_PERM_LOG_COUNT 2 +#define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 +#define XFS_CREATE_LOG_COUNT 2 +#define XFS_CREATE_TMPFILE_LOG_COUNT 2 +#define XFS_MKDIR_LOG_COUNT 3 +#define XFS_SYMLINK_LOG_COUNT 3 +#define XFS_REMOVE_LOG_COUNT 2 +#define XFS_LINK_LOG_COUNT 2 +#define XFS_RENAME_LOG_COUNT 2 +#define XFS_WRITE_LOG_COUNT 2 +#define XFS_ADDAFORK_LOG_COUNT 2 +#define XFS_ATTRINVAL_LOG_COUNT 1 +#define XFS_ATTRSET_LOG_COUNT 3 +#define XFS_ATTRRM_LOG_COUNT 3 + +void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); + +#endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h index 4ea2e5074bd..bf9c4579334 100644 --- a/fs/xfs/xfs_trans_space.h +++ b/fs/xfs/xfs_trans_space.h @@ -28,7 +28,8 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) -#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1) +#define XFS_DAENTER_1B(mp,w) \ + ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ (XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0)) #define XFS_DAENTER_BLOCKS(mp,w) \ @@ -47,13 +48,15 @@ #define XFS_DIRREMOVE_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ - (XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp)-1) + ((mp)->m_ialloc_blks + \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \ + ((mp)->m_in_maxlevels - 1))) /* * Space reservation values for various transactions. */ #define XFS_ADDAFORK_SPACE_RES(mp) \ - ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) + ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK)) #define XFS_ATTRRM_SPACE_RES(mp) \ XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK) /* This macro is not used - see inline code in xfs_attr_set */ @@ -82,5 +85,8 @@ (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) +#define XFS_IFREE_SPACE_RES(mp) \ + (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) + #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 104f64a9879..65c6e6650b1 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -18,53 +18,10 @@ #ifndef __XFS_TYPES_H__ #define __XFS_TYPES_H__ -#ifdef __KERNEL__ - -/* - * POSIX Extensions - */ -typedef unsigned char uchar_t; -typedef unsigned short ushort_t; -typedef unsigned int uint_t; -typedef unsigned long ulong_t; - -/* - * Additional type declarations for XFS - */ -typedef signed char __int8_t; -typedef unsigned char __uint8_t; -typedef signed short int __int16_t; -typedef unsigned short int __uint16_t; -typedef signed int __int32_t; -typedef unsigned int __uint32_t; -typedef signed long long int __int64_t; -typedef unsigned long long int __uint64_t; - -typedef enum { B_FALSE,B_TRUE } boolean_t; -typedef __uint32_t prid_t; /* project ID */ -typedef __uint32_t inst_t; /* an instruction */ - -typedef __s64 xfs_off_t; /* <file offset> type */ -typedef __u64 xfs_ino_t; /* <inode> type */ -typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ -typedef __u32 xfs_dev_t; -typedef __u32 xfs_nlink_t; - -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - -#endif /* __KERNEL__ */ +typedef __uint32_t prid_t; /* project ID */ typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ +typedef __uint32_t xfs_agino_t; /* inode # within allocation grp */ typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ typedef __uint32_t xfs_agnumber_t; /* allocation group number */ typedef __int32_t xfs_extnum_t; /* # of extents in a file */ @@ -81,8 +38,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */ typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ -typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ - /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: @@ -111,7 +66,6 @@ typedef __uint64_t xfs_fileoff_t; /* block number in a file */ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ -typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */ /* * Null values for the types. @@ -132,6 +86,9 @@ typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */ #define NULLCOMMITLSN ((xfs_lsn_t)-1) +#define NULLFSINO ((xfs_ino_t)-1) +#define NULLAGINO ((xfs_agino_t)-1) + /* * Max values for extlen, extnum, aextnum. */ @@ -140,6 +97,26 @@ typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */ #define MAXAEXTNUM ((xfs_aextnum_t)0x7fff) /* signed short */ /* + * Minimum and maximum blocksize and sectorsize. + * The blocksize upper limit is pretty much arbitrary. + * The sectorsize upper limit is due to sizeof(sb_sectsize). + */ +#define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ +#define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) +#define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ +#define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ +#define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) +#define XFS_MAX_SECTORSIZE (1 << XFS_MAX_SECTORSIZE_LOG) + +/* + * Inode fork identifiers. + */ +#define XFS_DATA_FORK 0 +#define XFS_ATTR_FORK 1 + +/* * Min numbers of data/attr fork btree root pointers. */ #define MINDBTPTRS 3 @@ -151,25 +128,35 @@ typedef __uint8_t xfs_arch_t; /* architecture of an xfs fs */ */ #define MAXNAMELEN 256 -typedef struct xfs_dirent { /* data from readdir() */ - xfs_ino_t d_ino; /* inode number of entry */ - xfs_off_t d_off; /* offset of disk directory entry */ - unsigned short d_reclen; /* length of this record */ - char d_name[1]; /* name of file */ -} xfs_dirent_t; - -#define DIRENTBASESIZE (((xfs_dirent_t *)0)->d_name - (char *)0) -#define DIRENTSIZE(namelen) \ - ((DIRENTBASESIZE + (namelen) + \ - sizeof(xfs_off_t)) & ~(sizeof(xfs_off_t) - 1)) - typedef enum { XFS_LOOKUP_EQi, XFS_LOOKUP_LEi, XFS_LOOKUP_GEi } xfs_lookup_t; typedef enum { XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi, - XFS_BTNUM_MAX + XFS_BTNUM_FINOi, XFS_BTNUM_MAX } xfs_btnum_t; +struct xfs_name { + const unsigned char *name; + int len; + int type; +}; + +/* + * uid_t and gid_t are hard-coded to 32 bits in the inode. + * Hence, an 'id' in a dquot is 32 bits.. + */ +typedef __uint32_t xfs_dqid_t; + +/* + * Constants for bit manipulations. + */ +#define XFS_NBBYLOG 3 /* log2(NBBY) */ +#define XFS_WORDLOG 2 /* log2(sizeof(xfs_rtword_t)) */ +#define XFS_NBWORDLOG (XFS_NBBYLOG + XFS_WORDLOG) +#define XFS_NBWORD (1 << XFS_NBWORDLOG) +#define XFS_WORDMASK ((1 << XFS_WORDLOG) - 1) + + #endif /* __XFS_TYPES_H__ */ diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c deleted file mode 100644 index 9014d7e4448..00000000000 --- a/fs/xfs/xfs_utils.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_bmap_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_bmap.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_rw.h" -#include "xfs_itable.h" -#include "xfs_utils.h" - -/* - * xfs_get_dir_entry is used to get a reference to an inode given - * its parent directory inode and the name of the file. It does - * not lock the child inode, and it unlocks the directory before - * returning. The directory's generation number is returned for - * use by a later call to xfs_lock_dir_and_entry. - */ -int -xfs_get_dir_entry( - bhv_vname_t *dentry, - xfs_inode_t **ipp) -{ - bhv_vnode_t *vp; - - vp = VNAME_TO_VNODE(dentry); - - *ipp = xfs_vtoi(vp); - if (!*ipp) - return XFS_ERROR(ENOENT); - VN_HOLD(vp); - return 0; -} - -int -xfs_dir_lookup_int( - bhv_desc_t *dir_bdp, - uint lock_mode, - bhv_vname_t *dentry, - xfs_ino_t *inum, - xfs_inode_t **ipp) -{ - bhv_vnode_t *dir_vp; - xfs_inode_t *dp; - int error; - - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - dp = XFS_BHVTOI(dir_bdp); - - error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum); - if (!error) { - /* - * Unlock the directory. We do this because we can't - * hold the directory lock while doing the vn_get() - * in xfs_iget(). Doing so could cause us to hold - * a lock while waiting for the inode to finish - * being inactive while it's waiting for a log - * reservation in the inactive routine. - */ - xfs_iunlock(dp, lock_mode); - error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0); - xfs_ilock(dp, lock_mode); - - if (error) { - *ipp = NULL; - } else if ((*ipp)->i_d.di_mode == 0) { - /* - * The inode has been freed. Something is - * wrong so just get out of here. - */ - xfs_iunlock(dp, lock_mode); - xfs_iput_new(*ipp, 0); - *ipp = NULL; - xfs_ilock(dp, lock_mode); - error = XFS_ERROR(ENOENT); - } - } - return error; -} - -/* - * Allocates a new inode from disk and return a pointer to the - * incore copy. This routine will internally commit the current - * transaction and allocate a new one if the Space Manager needed - * to do an allocation to replenish the inode free-list. - * - * This routine is designed to be called from xfs_create and - * xfs_create_dir. - * - */ -int -xfs_dir_ialloc( - xfs_trans_t **tpp, /* input: current transaction; - output: may be a new transaction. */ - xfs_inode_t *dp, /* directory within whose allocate - the inode. */ - mode_t mode, - xfs_nlink_t nlink, - xfs_dev_t rdev, - cred_t *credp, - prid_t prid, /* project id */ - int okalloc, /* ok to allocate new space */ - xfs_inode_t **ipp, /* pointer to inode; it will be - locked. */ - int *committed) - -{ - xfs_trans_t *tp; - xfs_trans_t *ntp; - xfs_inode_t *ip; - xfs_buf_t *ialloc_context = NULL; - boolean_t call_again = B_FALSE; - int code; - uint log_res; - uint log_count; - void *dqinfo; - uint tflags; - - tp = *tpp; - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - - /* - * xfs_ialloc will return a pointer to an incore inode if - * the Space Manager has an available inode on the free - * list. Otherwise, it will do an allocation and replenish - * the freelist. Since we can only do one allocation per - * transaction without deadlocks, we will need to commit the - * current transaction and start a new one. We will then - * need to call xfs_ialloc again to get the inode. - * - * If xfs_ialloc did an allocation to replenish the freelist, - * it returns the bp containing the head of the freelist as - * ialloc_context. We will hold a lock on it across the - * transaction commit so that no other process can steal - * the inode(s) that we've just allocated. - */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, okalloc, - &ialloc_context, &call_again, &ip); - - /* - * Return an error if we were unable to allocate a new inode. - * This should only happen if we run out of space on disk or - * encounter a disk error. - */ - if (code) { - *ipp = NULL; - return code; - } - if (!call_again && (ip == NULL)) { - *ipp = NULL; - return XFS_ERROR(ENOSPC); - } - - /* - * If call_again is set, then we were unable to get an - * inode in one operation. We need to commit the current - * transaction and call xfs_ialloc() again. It is guaranteed - * to succeed the second time. - */ - if (call_again) { - - /* - * Normally, xfs_trans_commit releases all the locks. - * We call bhold to hang on to the ialloc_context across - * the commit. Holding this buffer prevents any other - * processes from doing any allocations in this - * allocation group. - */ - xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. - */ - log_res = xfs_trans_get_log_res(tp); - log_count = xfs_trans_get_log_count(tp); - - /* - * We want the quota changes to be associated with the next - * transaction, NOT this one. So, detach the dqinfo from this - * and attach it to the next transaction. - */ - dqinfo = NULL; - tflags = 0; - if (tp->t_dqinfo) { - dqinfo = (void *)tp->t_dqinfo; - tp->t_dqinfo = NULL; - tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY; - tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); - } - - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0, NULL); - tp = ntp; - if (committed != NULL) { - *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - XFS_TRANS_FREE_DQINFO(tp->t_mountp, tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - code = xfs_trans_reserve(tp, 0, log_res, 0, - XFS_TRANS_PERM_LOG_RES, log_count); - /* - * Re-attach the quota info that we detached from prev trx. - */ - if (dqinfo) { - tp->t_dqinfo = dqinfo; - tp->t_flags |= tflags; - } - - if (code) { - xfs_buf_relse(ialloc_context); - *tpp = ntp; - *ipp = NULL; - return code; - } - xfs_trans_bjoin(tp, ialloc_context); - - /* - * Call ialloc again. Since we've locked out all - * other allocations in this allocation group, - * this call should always succeed. - */ - code = xfs_ialloc(tp, dp, mode, nlink, rdev, credp, prid, - okalloc, &ialloc_context, &call_again, &ip); - - /* - * If we get an error at this point, return to the caller - * so that the current transaction can be aborted. - */ - if (code) { - *tpp = tp; - *ipp = NULL; - return code; - } - ASSERT ((!call_again) && (ip != NULL)); - - } else { - if (committed != NULL) { - *committed = 0; - } - } - - *ipp = ip; - *tpp = tp; - - return 0; -} - -/* - * Decrement the link count on an inode & log the change. - * If this causes the link count to go to zero, initiate the - * logging activity required to truncate a file. - */ -int /* error */ -xfs_droplink( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - int error; - - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); - - ASSERT (ip->i_d.di_nlink > 0); - ip->i_d.di_nlink--; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - error = 0; - if (ip->i_d.di_nlink == 0) { - /* - * We're dropping the last link to this file. - * Move the on-disk inode to the AGI unlinked list. - * From xfs_inactive() we will pull the inode from - * the list and free it. - */ - error = xfs_iunlink(tp, ip); - } - return error; -} - -/* - * This gets called when the inode's version needs to be changed from 1 to 2. - * Currently this happens when the nlink field overflows the old 16-bit value - * or when chproj is called to change the project for the first time. - * As a side effect the superblock version will also get rev'd - * to contain the NLINK bit. - */ -void -xfs_bump_ino_vers2( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - xfs_mount_t *mp; - unsigned long s; - - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); - ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); - - ip->i_d.di_version = XFS_DINODE_VERSION_2; - ip->i_d.di_onlink = 0; - memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad)); - mp = tp->t_mountp; - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { - s = XFS_SB_LOCK(mp); - if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) { - XFS_SB_VERSION_ADDNLINK(&mp->m_sb); - XFS_SB_UNLOCK(mp, s); - xfs_mod_sb(tp, XFS_SB_VERSIONNUM); - } else { - XFS_SB_UNLOCK(mp, s); - } - } - /* Caller must log the inode */ -} - -/* - * Increment the link count on an inode & log the change. - */ -int -xfs_bumplink( - xfs_trans_t *tp, - xfs_inode_t *ip) -{ - if (ip->i_d.di_nlink >= XFS_MAXLINK) - return XFS_ERROR(EMLINK); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); - - ASSERT(ip->i_d.di_nlink > 0); - ip->i_d.di_nlink++; - if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) && - (ip->i_d.di_nlink > XFS_MAXLINK_1)) { - /* - * The inode has increased its number of links beyond - * what can fit in an old format inode. It now needs - * to be converted to a version 2 inode with a 32 bit - * link count. If this is the first inode in the file - * system to do this, then we need to bump the superblock - * version number as well. - */ - xfs_bump_ino_vers2(tp, ip); - } - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return 0; -} - -/* - * Try to truncate the given file to 0 length. Currently called - * only out of xfs_remove when it has to truncate a file to free - * up space for the remove to proceed. - */ -int -xfs_truncate_file( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error; - -#ifdef QUOTADEBUG - /* - * This is called to truncate the quotainodes too. - */ - if (XFS_IS_UQUOTA_ON(mp)) { - if (ip->i_ino != mp->m_sb.sb_uquotino) - ASSERT(ip->i_udquot); - } - if (XFS_IS_OQUOTA_ON(mp)) { - if (ip->i_ino != mp->m_sb.sb_gquotino) - ASSERT(ip->i_gdquot); - } -#endif - /* - * Make the call to xfs_itruncate_start before starting the - * transaction, because we cannot make the call while we're - * in a transaction. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, (xfs_fsize_t)0); - - tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - /* - * Follow the normal truncate locking protocol. Since we - * hold the inode in the transaction, we know that it's number - * of references will stay constant. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); - /* - * Signal a sync xaction. The only case where that isn't - * the case is if we're truncating an already unlinked file - * on a wsync fs. In that case, we know the blocks can't - * reappear in the file because the links to file are - * permanently toast. Currently, we're always going to - * want a sync transaction because this code is being - * called from places where nlink is guaranteed to be 1 - * but I'm leaving the tests in to protect against future - * changes -- rcc. - */ - error = xfs_itruncate_finish(&tp, ip, (xfs_fsize_t)0, - XFS_DATA_FORK, - ((ip->i_d.di_nlink != 0 || - !(mp->m_flags & XFS_MOUNT_WSYNC)) - ? 1 : 0)); - if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); - } else { - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, - NULL); - } - xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - return error; -} diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h deleted file mode 100644 index fe953e98afa..00000000000 --- a/fs/xfs/xfs_utils.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_UTILS_H__ -#define __XFS_UTILS_H__ - -#define IRELE(ip) VN_RELE(XFS_ITOV(ip)) -#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) -#define ITRACE(ip) vn_trace_ref(XFS_ITOV(ip), __FILE__, __LINE__, \ - (inst_t *)__return_address) - -extern int xfs_rename (bhv_desc_t *, bhv_vname_t *, bhv_vnode_t *, - bhv_vname_t *, cred_t *); -extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **); -extern int xfs_dir_lookup_int (bhv_desc_t *, uint, bhv_vname_t *, xfs_ino_t *, - xfs_inode_t **); -extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *); -extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, - xfs_dev_t, cred_t *, prid_t, int, - xfs_inode_t **, int *); -extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *); -extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *); -extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *); - -#endif /* __XFS_UTILS_H__ */ diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c deleted file mode 100644 index a34796e57af..00000000000 --- a/fs/xfs/xfs_vfsops.c +++ /dev/null @@ -1,1993 +0,0 @@ -/* - * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_btree.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" -#include "xfs_quota.h" -#include "xfs_error.h" -#include "xfs_bmap.h" -#include "xfs_rw.h" -#include "xfs_refcache.h" -#include "xfs_buf_item.h" -#include "xfs_log_priv.h" -#include "xfs_dir2_trace.h" -#include "xfs_extfree_item.h" -#include "xfs_acl.h" -#include "xfs_attr.h" -#include "xfs_clnt.h" -#include "xfs_fsops.h" - -STATIC int xfs_sync(bhv_desc_t *, int, cred_t *); - -int -xfs_init(void) -{ - extern kmem_zone_t *xfs_bmap_free_item_zone; - extern kmem_zone_t *xfs_btree_cur_zone; - extern kmem_zone_t *xfs_trans_zone; - extern kmem_zone_t *xfs_buf_item_zone; - extern kmem_zone_t *xfs_dabuf_zone; -#ifdef XFS_DABUF_DEBUG - extern lock_t xfs_dabuf_global_lock; - spinlock_init(&xfs_dabuf_global_lock, "xfsda"); -#endif - - /* - * Initialize all of the zone allocators we use. - */ - xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), - "xfs_bmap_free_item"); - xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), - "xfs_btree_cur"); - xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); - xfs_da_state_zone = - kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state"); - xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); - xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); - xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); - - /* - * The size of the zone allocated buf log item is the maximum - * size possible under XFS. This wastes a little bit of memory, - * but it is much faster. - */ - xfs_buf_item_zone = - kmem_zone_init((sizeof(xfs_buf_log_item_t) + - (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / - NBWORD) * sizeof(int))), - "xfs_buf_item"); - xfs_efd_zone = - kmem_zone_init((sizeof(xfs_efd_log_item_t) + - ((XFS_EFD_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), - "xfs_efd_item"); - xfs_efi_zone = - kmem_zone_init((sizeof(xfs_efi_log_item_t) + - ((XFS_EFI_MAX_FAST_EXTENTS - 1) * - sizeof(xfs_extent_t))), - "xfs_efi_item"); - - /* - * These zones warrant special memory allocator hints - */ - xfs_inode_zone = - kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | - KM_ZONE_SPREAD, NULL); - xfs_ili_zone = - kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", - KM_ZONE_SPREAD, NULL); - xfs_chashlist_zone = - kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist", - KM_ZONE_SPREAD, NULL); - - /* - * Allocate global trace buffers. - */ -#ifdef XFS_ALLOC_TRACE - xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMAP_TRACE - xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_BMBT_TRACE - xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_ATTR_TRACE - xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP); -#endif -#ifdef XFS_DIR2_TRACE - xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP); -#endif - - xfs_dir_startup(); - -#if (defined(DEBUG) || defined(INDUCE_IO_ERROR)) - xfs_error_test_init(); -#endif /* DEBUG || INDUCE_IO_ERROR */ - - xfs_init_procfs(); - xfs_sysctl_register(); - return 0; -} - -void -xfs_cleanup(void) -{ - extern kmem_zone_t *xfs_bmap_free_item_zone; - extern kmem_zone_t *xfs_btree_cur_zone; - extern kmem_zone_t *xfs_inode_zone; - extern kmem_zone_t *xfs_trans_zone; - extern kmem_zone_t *xfs_da_state_zone; - extern kmem_zone_t *xfs_dabuf_zone; - extern kmem_zone_t *xfs_efd_zone; - extern kmem_zone_t *xfs_efi_zone; - extern kmem_zone_t *xfs_buf_item_zone; - extern kmem_zone_t *xfs_chashlist_zone; - - xfs_cleanup_procfs(); - xfs_sysctl_unregister(); - xfs_refcache_destroy(); - xfs_acl_zone_destroy(xfs_acl_zone); - -#ifdef XFS_DIR2_TRACE - ktrace_free(xfs_dir2_trace_buf); -#endif -#ifdef XFS_ATTR_TRACE - ktrace_free(xfs_attr_trace_buf); -#endif -#ifdef XFS_BMBT_TRACE - ktrace_free(xfs_bmbt_trace_buf); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(xfs_bmap_trace_buf); -#endif -#ifdef XFS_ALLOC_TRACE - ktrace_free(xfs_alloc_trace_buf); -#endif - - kmem_zone_destroy(xfs_bmap_free_item_zone); - kmem_zone_destroy(xfs_btree_cur_zone); - kmem_zone_destroy(xfs_inode_zone); - kmem_zone_destroy(xfs_trans_zone); - kmem_zone_destroy(xfs_da_state_zone); - kmem_zone_destroy(xfs_dabuf_zone); - kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_efd_zone); - kmem_zone_destroy(xfs_efi_zone); - kmem_zone_destroy(xfs_ifork_zone); - kmem_zone_destroy(xfs_ili_zone); - kmem_zone_destroy(xfs_chashlist_zone); -} - -/* - * xfs_start_flags - * - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock has _not_ yet been read in. - */ -STATIC int -xfs_start_flags( - struct bhv_vfs *vfs, - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - /* Values are in BBs */ - if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { - /* - * At this point the superblock has not been read - * in, therefore we do not know the block size. - * Before the mount call ends we will convert - * these to FSBs. - */ - mp->m_dalign = ap->sunit; - mp->m_swidth = ap->swidth; - } - - if (ap->logbufs != -1 && - ap->logbufs != 0 && - (ap->logbufs < XLOG_MIN_ICLOGS || - ap->logbufs > XLOG_MAX_ICLOGS)) { - cmn_err(CE_WARN, - "XFS: invalid logbufs value: %d [not %d-%d]", - ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); - return XFS_ERROR(EINVAL); - } - mp->m_logbufs = ap->logbufs; - if (ap->logbufsize != -1 && - ap->logbufsize != 0 && - ap->logbufsize != 16 * 1024 && - ap->logbufsize != 32 * 1024 && - ap->logbufsize != 64 * 1024 && - ap->logbufsize != 128 * 1024 && - ap->logbufsize != 256 * 1024) { - cmn_err(CE_WARN, - "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", - ap->logbufsize); - return XFS_ERROR(EINVAL); - } - mp->m_ihsize = ap->ihashsize; - mp->m_logbsize = ap->logbufsize; - mp->m_fsname_len = strlen(ap->fsname) + 1; - mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP); - strcpy(mp->m_fsname, ap->fsname); - if (ap->rtname[0]) { - mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP); - strcpy(mp->m_rtname, ap->rtname); - } - if (ap->logname[0]) { - mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP); - strcpy(mp->m_logname, ap->logname); - } - - if (ap->flags & XFSMNT_WSYNC) - mp->m_flags |= XFS_MOUNT_WSYNC; -#if XFS_BIG_INUMS - if (ap->flags & XFSMNT_INO64) { - mp->m_flags |= XFS_MOUNT_INO64; - mp->m_inoadd = XFS_INO64_OFFSET; - } -#endif - if (ap->flags & XFSMNT_RETERR) - mp->m_flags |= XFS_MOUNT_RETERR; - if (ap->flags & XFSMNT_NOALIGN) - mp->m_flags |= XFS_MOUNT_NOALIGN; - if (ap->flags & XFSMNT_SWALLOC) - mp->m_flags |= XFS_MOUNT_SWALLOC; - if (ap->flags & XFSMNT_OSYNCISOSYNC) - mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC; - if (ap->flags & XFSMNT_32BITINODES) - mp->m_flags |= XFS_MOUNT_32BITINODES; - - if (ap->flags & XFSMNT_IOSIZE) { - if (ap->iosizelog > XFS_MAX_IO_LOG || - ap->iosizelog < XFS_MIN_IO_LOG) { - cmn_err(CE_WARN, - "XFS: invalid log iosize: %d [not %d-%d]", - ap->iosizelog, XFS_MIN_IO_LOG, - XFS_MAX_IO_LOG); - return XFS_ERROR(EINVAL); - } - - mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; - mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; - } - - if (ap->flags & XFSMNT_IHASHSIZE) - mp->m_flags |= XFS_MOUNT_IHASHSIZE; - if (ap->flags & XFSMNT_IDELETE) - mp->m_flags |= XFS_MOUNT_IDELETE; - if (ap->flags & XFSMNT_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (ap->flags & XFSMNT_ATTR2) - mp->m_flags |= XFS_MOUNT_ATTR2; - - if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE) - mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; - - /* - * no recovery flag requires a read-only mount - */ - if (ap->flags & XFSMNT_NORECOVERY) { - if (!(vfs->vfs_flag & VFS_RDONLY)) { - cmn_err(CE_WARN, - "XFS: tried to mount a FS read-write without recovery!"); - return XFS_ERROR(EINVAL); - } - mp->m_flags |= XFS_MOUNT_NORECOVERY; - } - - if (ap->flags & XFSMNT_NOUUID) - mp->m_flags |= XFS_MOUNT_NOUUID; - if (ap->flags & XFSMNT_BARRIER) - mp->m_flags |= XFS_MOUNT_BARRIER; - else - mp->m_flags &= ~XFS_MOUNT_BARRIER; - - return 0; -} - -/* - * This function fills in xfs_mount_t fields based on mount args. - * Note: the superblock _has_ now been read in. - */ -STATIC int -xfs_finish_flags( - struct bhv_vfs *vfs, - struct xfs_mount_args *ap, - struct xfs_mount *mp) -{ - int ronly = (vfs->vfs_flag & VFS_RDONLY); - - /* Fail a mount where the logbuf is smaller then the log stripe */ - if (XFS_SB_VERSION_HASLOGV2(&mp->m_sb)) { - if ((ap->logbufsize <= 0) && - (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) { - mp->m_logbsize = mp->m_sb.sb_logsunit; - } else if (ap->logbufsize > 0 && - ap->logbufsize < mp->m_sb.sb_logsunit) { - cmn_err(CE_WARN, - "XFS: logbuf size must be greater than or equal to log stripe size"); - return XFS_ERROR(EINVAL); - } - } else { - /* Fail a mount if the logbuf is larger than 32K */ - if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) { - cmn_err(CE_WARN, - "XFS: logbuf size for version 1 logs must be 16K or 32K"); - return XFS_ERROR(EINVAL); - } - } - - if (XFS_SB_VERSION_HASATTR2(&mp->m_sb)) { - mp->m_flags |= XFS_MOUNT_ATTR2; - } - - /* - * prohibit r/w mounts of read-only filesystems - */ - if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { - cmn_err(CE_WARN, - "XFS: cannot mount a read-only filesystem as read-write"); - return XFS_ERROR(EROFS); - } - - /* - * check for shared mount. - */ - if (ap->flags & XFSMNT_SHARED) { - if (!XFS_SB_VERSION_HASSHARED(&mp->m_sb)) - return XFS_ERROR(EINVAL); - - /* - * For IRIX 6.5, shared mounts must have the shared - * version bit set, have the persistent readonly - * field set, must be version 0 and can only be mounted - * read-only. - */ - if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) || - (mp->m_sb.sb_shared_vn != 0)) - return XFS_ERROR(EINVAL); - - mp->m_flags |= XFS_MOUNT_SHARED; - - /* - * Shared XFS V0 can't deal with DMI. Return EINVAL. - */ - if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI)) - return XFS_ERROR(EINVAL); - } - - return 0; -} - -/* - * xfs_mount - * - * The file system configurations are: - * (1) device (partition) with data and internal log - * (2) logical volume with data and log subvolumes. - * (3) logical volume with data, log, and realtime subvolumes. - * - * We only have to handle opening the log and realtime volumes here if - * they are present. The data subvolume has already been opened by - * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev. - */ -STATIC int -xfs_mount( - struct bhv_desc *bhvp, - struct xfs_mount_args *args, - cred_t *credp) -{ - struct bhv_vfs *vfsp = bhvtovfs(bhvp); - struct bhv_desc *p; - struct xfs_mount *mp = XFS_BHVTOM(bhvp); - struct block_device *ddev, *logdev, *rtdev; - int flags = 0, error; - - ddev = vfsp->vfs_super->s_bdev; - logdev = rtdev = NULL; - - /* - * Setup xfs_mount function vectors from available behaviors - */ - p = vfs_bhv_lookup(vfsp, VFS_POSITION_DM); - mp->m_dm_ops = p ? *(xfs_dmops_t *) vfs_bhv_custom(p) : xfs_dmcore_stub; - p = vfs_bhv_lookup(vfsp, VFS_POSITION_QM); - mp->m_qm_ops = p ? *(xfs_qmops_t *) vfs_bhv_custom(p) : xfs_qmcore_stub; - p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO); - mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs; - - if (args->flags & XFSMNT_QUIET) - flags |= XFS_MFSI_QUIET; - - /* - * Open real time and log devices - order is important. - */ - if (args->logname[0]) { - error = xfs_blkdev_get(mp, args->logname, &logdev); - if (error) - return error; - } - if (args->rtname[0]) { - error = xfs_blkdev_get(mp, args->rtname, &rtdev); - if (error) { - xfs_blkdev_put(logdev); - return error; - } - - if (rtdev == ddev || rtdev == logdev) { - cmn_err(CE_WARN, - "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - return EINVAL; - } - } - - /* - * Setup xfs_mount buffer target pointers - */ - error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0); - if (!mp->m_ddev_targp) { - xfs_blkdev_put(logdev); - xfs_blkdev_put(rtdev); - return error; - } - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1); - if (!mp->m_rtdev_targp) - goto error0; - } - mp->m_logdev_targp = (logdev && logdev != ddev) ? - xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp; - if (!mp->m_logdev_targp) - goto error0; - - /* - * Setup flags based on mount(2) options and then the superblock - */ - error = xfs_start_flags(vfsp, args, mp); - if (error) - goto error1; - error = xfs_readsb(mp, flags); - if (error) - goto error1; - error = xfs_finish_flags(vfsp, args, mp); - if (error) - goto error2; - - /* - * Setup xfs_mount buffer target pointers based on superblock - */ - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); - if (!error && logdev && logdev != ddev) { - unsigned int log_sector_size = BBSIZE; - - if (XFS_SB_VERSION_HASSECTOR(&mp->m_sb)) - log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, - mp->m_sb.sb_blocksize, - log_sector_size); - } - if (!error && rtdev) - error = xfs_setsize_buftarg(mp->m_rtdev_targp, - mp->m_sb.sb_blocksize, - mp->m_sb.sb_sectsize); - if (error) - goto error2; - - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_mountfs_check_barriers(mp); - - error = XFS_IOINIT(vfsp, args, flags); - if (error) - goto error2; - - return 0; - -error2: - if (mp->m_sb_bp) - xfs_freesb(mp); -error1: - xfs_binval(mp->m_ddev_targp); - if (logdev && logdev != ddev) - xfs_binval(mp->m_logdev_targp); - if (rtdev) - xfs_binval(mp->m_rtdev_targp); -error0: - xfs_unmountfs_close(mp, credp); - return error; -} - -STATIC int -xfs_unmount( - bhv_desc_t *bdp, - int flags, - cred_t *credp) -{ - bhv_vfs_t *vfsp = bhvtovfs(bdp); - xfs_mount_t *mp = XFS_BHVTOM(bdp); - xfs_inode_t *rip; - bhv_vnode_t *rvp; - int unmount_event_wanted = 0; - int unmount_event_flags = 0; - int xfs_unmountfs_needed = 0; - int error; - - rip = mp->m_rootip; - rvp = XFS_ITOV(rip); - - if (vfsp->vfs_flag & VFS_DMI) { - error = XFS_SEND_PREUNMOUNT(mp, vfsp, - rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL, - NULL, NULL, 0, 0, - (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? - 0:DM_FLAGS_UNWANTED); - if (error) - return XFS_ERROR(error); - unmount_event_wanted = 1; - unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))? - 0 : DM_FLAGS_UNWANTED; - } - - /* - * First blow any referenced inode from this file system - * out of the reference cache, and delete the timer. - */ - xfs_refcache_purge_mp(mp); - - XFS_bflush(mp->m_ddev_targp); - error = xfs_unmount_flush(mp, 0); - if (error) - goto out; - - ASSERT(vn_count(rvp) == 1); - - /* - * Drop the reference count - */ - VN_RELE(rvp); - - /* - * If we're forcing a shutdown, typically because of a media error, - * we want to make sure we invalidate dirty pages that belong to - * referenced vnodes as well. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - error = xfs_sync(&mp->m_bhv, - (SYNC_WAIT | SYNC_CLOSE), credp); - ASSERT(error != EFSCORRUPTED); - } - xfs_unmountfs_needed = 1; - -out: - /* Send DMAPI event, if required. - * Then do xfs_unmountfs() if needed. - * Then return error (or zero). - */ - if (unmount_event_wanted) { - /* Note: mp structure must still exist for - * XFS_SEND_UNMOUNT() call. - */ - XFS_SEND_UNMOUNT(mp, vfsp, error == 0 ? rvp : NULL, - DM_RIGHT_NULL, 0, error, unmount_event_flags); - } - if (xfs_unmountfs_needed) { - /* - * Call common unmount function to flush to disk - * and free the super block buffer & mount structures. - */ - xfs_unmountfs(mp, credp); - } - - return XFS_ERROR(error); -} - -STATIC int -xfs_quiesce_fs( - xfs_mount_t *mp) -{ - int count = 0, pincount; - - xfs_refcache_purge_mp(mp); - xfs_flush_buftarg(mp->m_ddev_targp, 0); - xfs_finish_reclaim_all(mp, 0); - - /* This loop must run at least twice. - * The first instance of the loop will flush - * most meta data but that will generate more - * meta data (typically directory updates). - * Which then must be flushed and logged before - * we can write the unmount record. - */ - do { - xfs_syncsub(mp, SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT, 0, NULL); - pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1); - if (!pincount) { - delay(50); - count++; - } - } while (count < 2); - - return 0; -} - -STATIC int -xfs_mntupdate( - bhv_desc_t *bdp, - int *flags, - struct xfs_mount_args *args) -{ - bhv_vfs_t *vfsp = bhvtovfs(bdp); - xfs_mount_t *mp = XFS_BHVTOM(bdp); - - if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */ - if (vfsp->vfs_flag & VFS_RDONLY) - vfsp->vfs_flag &= ~VFS_RDONLY; - if (args->flags & XFSMNT_BARRIER) { - mp->m_flags |= XFS_MOUNT_BARRIER; - xfs_mountfs_check_barriers(mp); - } else { - mp->m_flags &= ~XFS_MOUNT_BARRIER; - } - } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ - bhv_vfs_sync(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL); - xfs_quiesce_fs(mp); - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); - vfsp->vfs_flag |= VFS_RDONLY; - } - return 0; -} - -/* - * xfs_unmount_flush implements a set of flush operation on special - * inodes, which are needed as a separate set of operations so that - * they can be called as part of relocation process. - */ -int -xfs_unmount_flush( - xfs_mount_t *mp, /* Mount structure we are getting - rid of. */ - int relocation) /* Called from vfs relocation. */ -{ - xfs_inode_t *rip = mp->m_rootip; - xfs_inode_t *rbmip; - xfs_inode_t *rsumip = NULL; - bhv_vnode_t *rvp = XFS_ITOV(rip); - int error; - - xfs_ilock(rip, XFS_ILOCK_EXCL); - xfs_iflock(rip); - - /* - * Flush out the real time inodes. - */ - if ((rbmip = mp->m_rbmip) != NULL) { - xfs_ilock(rbmip, XFS_ILOCK_EXCL); - xfs_iflock(rbmip); - error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC); - xfs_iunlock(rbmip, XFS_ILOCK_EXCL); - - if (error == EFSCORRUPTED) - goto fscorrupt_out; - - ASSERT(vn_count(XFS_ITOV(rbmip)) == 1); - - rsumip = mp->m_rsumip; - xfs_ilock(rsumip, XFS_ILOCK_EXCL); - xfs_iflock(rsumip); - error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC); - xfs_iunlock(rsumip, XFS_ILOCK_EXCL); - - if (error == EFSCORRUPTED) - goto fscorrupt_out; - - ASSERT(vn_count(XFS_ITOV(rsumip)) == 1); - } - - /* - * Synchronously flush root inode to disk - */ - error = xfs_iflush(rip, XFS_IFLUSH_SYNC); - if (error == EFSCORRUPTED) - goto fscorrupt_out2; - - if (vn_count(rvp) != 1 && !relocation) { - xfs_iunlock(rip, XFS_ILOCK_EXCL); - return XFS_ERROR(EBUSY); - } - - /* - * Release dquot that rootinode, rbmino and rsumino might be holding, - * flush and purge the quota inodes. - */ - error = XFS_QM_UNMOUNT(mp); - if (error == EFSCORRUPTED) - goto fscorrupt_out2; - - if (rbmip) { - VN_RELE(XFS_ITOV(rbmip)); - VN_RELE(XFS_ITOV(rsumip)); - } - - xfs_iunlock(rip, XFS_ILOCK_EXCL); - return 0; - -fscorrupt_out: - xfs_ifunlock(rip); - -fscorrupt_out2: - xfs_iunlock(rip, XFS_ILOCK_EXCL); - - return XFS_ERROR(EFSCORRUPTED); -} - -/* - * xfs_root extracts the root vnode from a vfs. - * - * vfsp -- the vfs struct for the desired file system - * vpp -- address of the caller's vnode pointer which should be - * set to the desired fs root vnode - */ -STATIC int -xfs_root( - bhv_desc_t *bdp, - bhv_vnode_t **vpp) -{ - bhv_vnode_t *vp; - - vp = XFS_ITOV((XFS_BHVTOM(bdp))->m_rootip); - VN_HOLD(vp); - *vpp = vp; - return 0; -} - -/* - * xfs_statvfs - * - * Fill in the statvfs structure for the given file system. We use - * the superblock lock in the mount structure to ensure a consistent - * snapshot of the counters returned. - */ -STATIC int -xfs_statvfs( - bhv_desc_t *bdp, - bhv_statvfs_t *statp, - bhv_vnode_t *vp) -{ - __uint64_t fakeinos; - xfs_extlen_t lsize; - xfs_mount_t *mp; - xfs_sb_t *sbp; - unsigned long s; - - mp = XFS_BHVTOM(bdp); - sbp = &(mp->m_sb); - - statp->f_type = XFS_SB_MAGIC; - - xfs_icsb_sync_counters_lazy(mp); - s = XFS_SB_LOCK(mp); - statp->f_bsize = sbp->sb_blocksize; - lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; - statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = - sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - fakeinos = statp->f_bfree << sbp->sb_inopblog; -#if XFS_BIG_INUMS - fakeinos += mp->m_inoadd; -#endif - statp->f_files = - MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); - if (mp->m_maxicount) -#if XFS_BIG_INUMS - if (!mp->m_inoadd) -#endif - statp->f_files = min_t(typeof(statp->f_files), - statp->f_files, - mp->m_maxicount); - statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); - XFS_SB_UNLOCK(mp, s); - - xfs_statvfs_fsid(statp, mp); - statp->f_namelen = MAXNAMELEN - 1; - - return 0; -} - - -/* - * xfs_sync flushes any pending I/O to file system vfsp. - * - * This routine is called by vfs_sync() to make sure that things make it - * out to disk eventually, on sync() system calls to flush out everything, - * and when the file system is unmounted. For the vfs_sync() case, all - * we really need to do is sync out the log to make all of our meta-data - * updates permanent (except for timestamps). For calls from pflushd(), - * dirty pages are kept moving by calling pdflush() on the inodes - * containing them. We also flush the inodes that we can lock without - * sleeping and the superblock if we can lock it without sleeping from - * vfs_sync() so that items at the tail of the log are always moving out. - * - * Flags: - * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want - * to sleep if we can help it. All we really need - * to do is ensure that the log is synced at least - * periodically. We also push the inodes and - * superblock if we can lock them without sleeping - * and they are not pinned. - * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not - * set, then we really want to lock each inode and flush - * it. - * SYNC_WAIT - All the flushes that take place in this call should - * be synchronous. - * SYNC_DELWRI - This tells us to push dirty pages associated with - * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to - * determine if they should be flushed sync, async, or - * delwri. - * SYNC_CLOSE - This flag is passed when the system is being - * unmounted. We should sync and invalidate everything. - * SYNC_FSDATA - This indicates that the caller would like to make - * sure the superblock is safe on disk. We can ensure - * this by simply making sure the log gets flushed - * if SYNC_BDFLUSH is set, and by actually writing it - * out otherwise. - * - */ -/*ARGSUSED*/ -STATIC int -xfs_sync( - bhv_desc_t *bdp, - int flags, - cred_t *credp) -{ - xfs_mount_t *mp = XFS_BHVTOM(bdp); - - if (unlikely(flags == SYNC_QUIESCE)) - return xfs_quiesce_fs(mp); - else - return xfs_syncsub(mp, flags, 0, NULL); -} - -/* - * xfs sync routine for internal use - * - * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internally- - * required functions are specified within a separate xflags parameter, - * only available by calling this routine. - * - */ -int -xfs_sync_inodes( - xfs_mount_t *mp, - int flags, - int xflags, - int *bypassed) -{ - xfs_inode_t *ip = NULL; - xfs_inode_t *ip_next; - xfs_buf_t *bp; - bhv_vnode_t *vp = NULL; - int error; - int last_error; - uint64_t fflag; - uint lock_flags; - uint base_lock_flags; - boolean_t mount_locked; - boolean_t vnode_refed; - int preempt; - xfs_dinode_t *dip; - xfs_iptr_t *ipointer; -#ifdef DEBUG - boolean_t ipointer_in = B_FALSE; - -#define IPOINTER_SET ipointer_in = B_TRUE -#define IPOINTER_CLR ipointer_in = B_FALSE -#else -#define IPOINTER_SET -#define IPOINTER_CLR -#endif - - -/* Insert a marker record into the inode list after inode ip. The list - * must be locked when this is called. After the call the list will no - * longer be locked. - */ -#define IPOINTER_INSERT(ip, mp) { \ - ASSERT(ipointer_in == B_FALSE); \ - ipointer->ip_mnext = ip->i_mnext; \ - ipointer->ip_mprev = ip; \ - ip->i_mnext = (xfs_inode_t *)ipointer; \ - ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \ - preempt = 0; \ - XFS_MOUNT_IUNLOCK(mp); \ - mount_locked = B_FALSE; \ - IPOINTER_SET; \ - } - -/* Remove the marker from the inode list. If the marker was the only item - * in the list then there are no remaining inodes and we should zero out - * the whole list. If we are the current head of the list then move the head - * past us. - */ -#define IPOINTER_REMOVE(ip, mp) { \ - ASSERT(ipointer_in == B_TRUE); \ - if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \ - ip = ipointer->ip_mnext; \ - ip->i_mprev = ipointer->ip_mprev; \ - ipointer->ip_mprev->i_mnext = ip; \ - if (mp->m_inodes == (xfs_inode_t *)ipointer) { \ - mp->m_inodes = ip; \ - } \ - } else { \ - ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \ - mp->m_inodes = NULL; \ - ip = NULL; \ - } \ - IPOINTER_CLR; \ - } - -#define XFS_PREEMPT_MASK 0x7f - - if (bypassed) - *bypassed = 0; - if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY) - return 0; - error = 0; - last_error = 0; - preempt = 0; - - /* Allocate a reference marker */ - ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP); - - fflag = XFS_B_ASYNC; /* default is don't wait */ - if (flags & (SYNC_BDFLUSH | SYNC_DELWRI)) - fflag = XFS_B_DELWRI; - if (flags & SYNC_WAIT) - fflag = 0; /* synchronous overrides all */ - - base_lock_flags = XFS_ILOCK_SHARED; - if (flags & (SYNC_DELWRI | SYNC_CLOSE)) { - /* - * We need the I/O lock if we're going to call any of - * the flush/inval routines. - */ - base_lock_flags |= XFS_IOLOCK_SHARED; - } - - XFS_MOUNT_ILOCK(mp); - - ip = mp->m_inodes; - - mount_locked = B_TRUE; - vnode_refed = B_FALSE; - - IPOINTER_CLR; - - do { - ASSERT(ipointer_in == B_FALSE); - ASSERT(vnode_refed == B_FALSE); - - lock_flags = base_lock_flags; - - /* - * There were no inodes in the list, just break out - * of the loop. - */ - if (ip == NULL) { - break; - } - - /* - * We found another sync thread marker - skip it - */ - if (ip->i_mount == NULL) { - ip = ip->i_mnext; - continue; - } - - vp = XFS_ITOV_NULL(ip); - - /* - * If the vnode is gone then this is being torn down, - * call reclaim if it is flushed, else let regular flush - * code deal with it later in the loop. - */ - - if (vp == NULL) { - /* Skip ones already in reclaim */ - if (ip->i_flags & XFS_IRECLAIM) { - ip = ip->i_mnext; - continue; - } - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) { - ip = ip->i_mnext; - } else if ((xfs_ipincount(ip) == 0) && - xfs_iflock_nowait(ip)) { - IPOINTER_INSERT(ip, mp); - - xfs_finish_reclaim(ip, 1, - XFS_IFLUSH_DELWRI_ELSE_ASYNC); - - XFS_MOUNT_ILOCK(mp); - mount_locked = B_TRUE; - IPOINTER_REMOVE(ip, mp); - } else { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - ip = ip->i_mnext; - } - continue; - } - - if (VN_BAD(vp)) { - ip = ip->i_mnext; - continue; - } - - if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) { - XFS_MOUNT_IUNLOCK(mp); - kmem_free(ipointer, sizeof(xfs_iptr_t)); - return 0; - } - - /* - * If this is just vfs_sync() or pflushd() calling - * then we can skip inodes for which it looks like - * there is nothing to do. Since we don't have the - * inode locked this is racy, but these are periodic - * calls so it doesn't matter. For the others we want - * to know for sure, so we at least try to lock them. - */ - if (flags & SYNC_BDFLUSH) { - if (((ip->i_itemp == NULL) || - !(ip->i_itemp->ili_format.ilf_fields & - XFS_ILOG_ALL)) && - (ip->i_update_core == 0)) { - ip = ip->i_mnext; - continue; - } - } - - /* - * Try to lock without sleeping. We're out of order with - * the inode list lock here, so if we fail we need to drop - * the mount lock and try again. If we're called from - * bdflush() here, then don't bother. - * - * The inode lock here actually coordinates with the - * almost spurious inode lock in xfs_ireclaim() to prevent - * the vnode we handle here without a reference from - * being freed while we reference it. If we lock the inode - * while it's on the mount list here, then the spurious inode - * lock in xfs_ireclaim() after the inode is pulled from - * the mount list will sleep until we release it here. - * This keeps the vnode from being freed while we reference - * it. - */ - if (xfs_ilock_nowait(ip, lock_flags) == 0) { - if ((flags & SYNC_BDFLUSH) || (vp == NULL)) { - ip = ip->i_mnext; - continue; - } - - vp = vn_grab(vp); - if (vp == NULL) { - ip = ip->i_mnext; - continue; - } - - IPOINTER_INSERT(ip, mp); - xfs_ilock(ip, lock_flags); - - ASSERT(vp == XFS_ITOV(ip)); - ASSERT(ip->i_mount == mp); - - vnode_refed = B_TRUE; - } - - /* From here on in the loop we may have a marker record - * in the inode list. - */ - - if ((flags & SYNC_CLOSE) && (vp != NULL)) { - /* - * This is the shutdown case. We just need to - * flush and invalidate all the pages associated - * with the inode. Drop the inode lock since - * we can't hold it across calls to the buffer - * cache. - * - * We don't set the VREMAPPING bit in the vnode - * here, because we don't hold the vnode lock - * exclusively. It doesn't really matter, though, - * because we only come here when we're shutting - * down anyway. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (XFS_FORCED_SHUTDOWN(mp)) { - bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF); - } else { - bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF); - } - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - } else if ((flags & SYNC_DELWRI) && (vp != NULL)) { - if (VN_DIRTY(vp)) { - /* We need to have dropped the lock here, - * so insert a marker if we have not already - * done so. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - /* - * Drop the inode lock since we can't hold it - * across calls to the buffer cache. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - error = bhv_vop_flush_pages(vp, (xfs_off_t)0, - -1, fflag, FI_NONE); - xfs_ilock(ip, XFS_ILOCK_SHARED); - } - - } - - if (flags & SYNC_BDFLUSH) { - if ((flags & SYNC_ATTR) && - ((ip->i_update_core) || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0)))) { - - /* Insert marker and drop lock if not already - * done. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - /* - * We don't want the periodic flushing of the - * inodes by vfs_sync() to interfere with - * I/O to the file, especially read I/O - * where it is only the access time stamp - * that is being flushed out. To prevent - * long periods where we have both inode - * locks held shared here while reading the - * inode's buffer in from disk, we drop the - * inode lock while reading in the inode - * buffer. We have to release the buffer - * and reacquire the inode lock so that they - * are acquired in the proper order (inode - * locks first). The buffer will go at the - * end of the lru chain, though, so we can - * expect it to still be there when we go - * for it again in xfs_iflush(). - */ - if ((xfs_ipincount(ip) == 0) && - xfs_iflock_nowait(ip)) { - - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - error = xfs_itobp(mp, NULL, ip, - &dip, &bp, 0, 0); - if (!error) { - xfs_buf_relse(bp); - } else { - /* Bailing out, remove the - * marker and free it. - */ - XFS_MOUNT_ILOCK(mp); - IPOINTER_REMOVE(ip, mp); - XFS_MOUNT_IUNLOCK(mp); - - ASSERT(!(lock_flags & - XFS_IOLOCK_SHARED)); - - kmem_free(ipointer, - sizeof(xfs_iptr_t)); - return (0); - } - - /* - * Since we dropped the inode lock, - * the inode may have been reclaimed. - * Therefore, we reacquire the mount - * lock and check to see if we were the - * inode reclaimed. If this happened - * then the ipointer marker will no - * longer point back at us. In this - * case, move ip along to the inode - * after the marker, remove the marker - * and continue. - */ - XFS_MOUNT_ILOCK(mp); - mount_locked = B_TRUE; - - if (ip != ipointer->ip_mprev) { - IPOINTER_REMOVE(ip, mp); - - ASSERT(!vnode_refed); - ASSERT(!(lock_flags & - XFS_IOLOCK_SHARED)); - continue; - } - - ASSERT(ip->i_mount == mp); - - if (xfs_ilock_nowait(ip, - XFS_ILOCK_SHARED) == 0) { - ASSERT(ip->i_mount == mp); - /* - * We failed to reacquire - * the inode lock without - * sleeping, so just skip - * the inode for now. We - * clear the ILOCK bit from - * the lock_flags so that we - * won't try to drop a lock - * we don't hold below. - */ - lock_flags &= ~XFS_ILOCK_SHARED; - IPOINTER_REMOVE(ip_next, mp); - } else if ((xfs_ipincount(ip) == 0) && - xfs_iflock_nowait(ip)) { - ASSERT(ip->i_mount == mp); - /* - * Since this is vfs_sync() - * calling we only flush the - * inode out if we can lock - * it without sleeping and - * it is not pinned. Drop - * the mount lock here so - * that we don't hold it for - * too long. We already have - * a marker in the list here. - */ - XFS_MOUNT_IUNLOCK(mp); - mount_locked = B_FALSE; - error = xfs_iflush(ip, - XFS_IFLUSH_DELWRI); - } else { - ASSERT(ip->i_mount == mp); - IPOINTER_REMOVE(ip_next, mp); - } - } - - } - - } else { - if ((flags & SYNC_ATTR) && - ((ip->i_update_core) || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0)))) { - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - if (flags & SYNC_WAIT) { - xfs_iflock(ip); - error = xfs_iflush(ip, - XFS_IFLUSH_SYNC); - } else { - /* - * If we can't acquire the flush - * lock, then the inode is already - * being flushed so don't bother - * waiting. If we can lock it then - * do a delwri flush so we can - * combine multiple inode flushes - * in each disk write. - */ - if (xfs_iflock_nowait(ip)) { - error = xfs_iflush(ip, - XFS_IFLUSH_DELWRI); - } - else if (bypassed) - (*bypassed)++; - } - } - } - - if (lock_flags != 0) { - xfs_iunlock(ip, lock_flags); - } - - if (vnode_refed) { - /* - * If we had to take a reference on the vnode - * above, then wait until after we've unlocked - * the inode to release the reference. This is - * because we can be already holding the inode - * lock when VN_RELE() calls xfs_inactive(). - * - * Make sure to drop the mount lock before calling - * VN_RELE() so that we don't trip over ourselves if - * we have to go for the mount lock again in the - * inactive code. - */ - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - - VN_RELE(vp); - - vnode_refed = B_FALSE; - } - - if (error) { - last_error = error; - } - - /* - * bail out if the filesystem is corrupted. - */ - if (error == EFSCORRUPTED) { - if (!mount_locked) { - XFS_MOUNT_ILOCK(mp); - IPOINTER_REMOVE(ip, mp); - } - XFS_MOUNT_IUNLOCK(mp); - ASSERT(ipointer_in == B_FALSE); - kmem_free(ipointer, sizeof(xfs_iptr_t)); - return XFS_ERROR(error); - } - - /* Let other threads have a chance at the mount lock - * if we have looped many times without dropping the - * lock. - */ - if ((++preempt & XFS_PREEMPT_MASK) == 0) { - if (mount_locked) { - IPOINTER_INSERT(ip, mp); - } - } - - if (mount_locked == B_FALSE) { - XFS_MOUNT_ILOCK(mp); - mount_locked = B_TRUE; - IPOINTER_REMOVE(ip, mp); - continue; - } - - ASSERT(ipointer_in == B_FALSE); - ip = ip->i_mnext; - - } while (ip != mp->m_inodes); - - XFS_MOUNT_IUNLOCK(mp); - - ASSERT(ipointer_in == B_FALSE); - - kmem_free(ipointer, sizeof(xfs_iptr_t)); - return XFS_ERROR(last_error); -} - -/* - * xfs sync routine for internal use - * - * This routine supports all of the flags defined for the generic vfs_sync - * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internally- - * required functions are specified within a separate xflags parameter, - * only available by calling this routine. - * - */ -int -xfs_syncsub( - xfs_mount_t *mp, - int flags, - int xflags, - int *bypassed) -{ - int error = 0; - int last_error = 0; - uint log_flags = XFS_LOG_FORCE; - xfs_buf_t *bp; - xfs_buf_log_item_t *bip; - - /* - * Sync out the log. This ensures that the log is periodically - * flushed even if there is not enough activity to fill it up. - */ - if (flags & SYNC_WAIT) - log_flags |= XFS_LOG_SYNC; - - xfs_log_force(mp, (xfs_lsn_t)0, log_flags); - - if (flags & (SYNC_ATTR|SYNC_DELWRI)) { - if (flags & SYNC_BDFLUSH) - xfs_finish_reclaim_all(mp, 1); - else - error = xfs_sync_inodes(mp, flags, xflags, bypassed); - } - - /* - * Flushing out dirty data above probably generated more - * log activity, so if this isn't vfs_sync() then flush - * the log again. - */ - if (flags & SYNC_DELWRI) { - xfs_log_force(mp, (xfs_lsn_t)0, log_flags); - } - - if (flags & SYNC_FSDATA) { - /* - * If this is vfs_sync() then only sync the superblock - * if we can lock it without sleeping and it is not pinned. - */ - if (flags & SYNC_BDFLUSH) { - bp = xfs_getsb(mp, XFS_BUF_TRYLOCK); - if (bp != NULL) { - bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*); - if ((bip != NULL) && - xfs_buf_item_dirty(bip)) { - if (!(XFS_BUF_ISPINNED(bp))) { - XFS_BUF_ASYNC(bp); - error = xfs_bwrite(mp, bp); - } else { - xfs_buf_relse(bp); - } - } else { - xfs_buf_relse(bp); - } - } - } else { - bp = xfs_getsb(mp, 0); - /* - * If the buffer is pinned then push on the log so - * we won't get stuck waiting in the write for - * someone, maybe ourselves, to flush the log. - * Even though we just pushed the log above, we - * did not have the superblock buffer locked at - * that point so it can become pinned in between - * there and here. - */ - if (XFS_BUF_ISPINNED(bp)) - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); - if (flags & SYNC_WAIT) - XFS_BUF_UNASYNC(bp); - else - XFS_BUF_ASYNC(bp); - error = xfs_bwrite(mp, bp); - } - if (error) { - last_error = error; - } - } - - /* - * If this is the periodic sync, then kick some entries out of - * the reference cache. This ensures that idle entries are - * eventually kicked out of the cache. - */ - if (flags & SYNC_REFCACHE) { - if (flags & SYNC_WAIT) - xfs_refcache_purge_mp(mp); - else - xfs_refcache_purge_some(mp); - } - - /* - * Now check to see if the log needs a "dummy" transaction. - */ - - if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { - xfs_trans_t *tp; - xfs_inode_t *ip; - - /* - * Put a dummy transaction in the log to tell - * recovery that all others are OK. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - if ((error = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), - 0, 0, 0))) { - xfs_trans_cancel(tp, 0); - return error; - } - - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_log_force(mp, (xfs_lsn_t)0, log_flags); - } - - /* - * When shutting down, we need to insure that the AIL is pushed - * to disk or the filesystem can appear corrupt from the PROM. - */ - if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) { - XFS_bflush(mp->m_ddev_targp); - if (mp->m_rtdev_targp) { - XFS_bflush(mp->m_rtdev_targp); - } - } - - return XFS_ERROR(last_error); -} - -/* - * xfs_vget - called by DMAPI and NFSD to get vnode from file handle - */ -STATIC int -xfs_vget( - bhv_desc_t *bdp, - bhv_vnode_t **vpp, - fid_t *fidp) -{ - xfs_mount_t *mp = XFS_BHVTOM(bdp); - xfs_fid_t *xfid = (struct xfs_fid *)fidp; - xfs_inode_t *ip; - int error; - xfs_ino_t ino; - unsigned int igen; - - /* - * Invalid. Since handles can be created in user space and passed in - * via gethandle(), this is not cause for a panic. - */ - if (xfid->xfs_fid_len != sizeof(*xfid) - sizeof(xfid->xfs_fid_len)) - return XFS_ERROR(EINVAL); - - ino = xfid->xfs_fid_ino; - igen = xfid->xfs_fid_gen; - - /* - * NFS can sometimes send requests for ino 0. Fail them gracefully. - */ - if (ino == 0) - return XFS_ERROR(ESTALE); - - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0); - if (error) { - *vpp = NULL; - return error; - } - - if (ip == NULL) { - *vpp = NULL; - return XFS_ERROR(EIO); - } - - if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { - xfs_iput_new(ip, XFS_ILOCK_SHARED); - *vpp = NULL; - return XFS_ERROR(ENOENT); - } - - *vpp = XFS_ITOV(ip); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return 0; -} - - -#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ -#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ -#define MNTOPT_LOGDEV "logdev" /* log device */ -#define MNTOPT_RTDEV "rtdev" /* realtime I/O device */ -#define MNTOPT_BIOSIZE "biosize" /* log2 of preferred buffered io size */ -#define MNTOPT_WSYNC "wsync" /* safe-mode nfs compatible mount */ -#define MNTOPT_INO64 "ino64" /* force inodes into 64-bit range */ -#define MNTOPT_NOALIGN "noalign" /* turn off stripe alignment */ -#define MNTOPT_SWALLOC "swalloc" /* turn on stripe width allocation */ -#define MNTOPT_SUNIT "sunit" /* data volume stripe unit */ -#define MNTOPT_SWIDTH "swidth" /* data volume stripe width */ -#define MNTOPT_NOUUID "nouuid" /* ignore filesystem UUID */ -#define MNTOPT_MTPT "mtpt" /* filesystem mount point */ -#define MNTOPT_GRPID "grpid" /* group-ID from parent directory */ -#define MNTOPT_NOGRPID "nogrpid" /* group-ID from current process */ -#define MNTOPT_BSDGROUPS "bsdgroups" /* group-ID from parent directory */ -#define MNTOPT_SYSVGROUPS "sysvgroups" /* group-ID from current process */ -#define MNTOPT_ALLOCSIZE "allocsize" /* preferred allocation size */ -#define MNTOPT_IHASHSIZE "ihashsize" /* size of inode hash table */ -#define MNTOPT_NORECOVERY "norecovery" /* don't run XFS recovery */ -#define MNTOPT_BARRIER "barrier" /* use writer barriers for log write and - * unwritten extent conversion */ -#define MNTOPT_NOBARRIER "nobarrier" /* .. disable */ -#define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ -#define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ -#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ -#define MNTOPT_NOIKEEP "noikeep" /* free empty inode clusters */ -#define MNTOPT_LARGEIO "largeio" /* report large I/O sizes in stat() */ -#define MNTOPT_NOLARGEIO "nolargeio" /* do not report large I/O sizes - * in stat(). */ -#define MNTOPT_ATTR2 "attr2" /* do use attr2 attribute format */ -#define MNTOPT_NOATTR2 "noattr2" /* do not use attr2 attribute format */ - -STATIC unsigned long -suffix_strtoul(char *s, char **endp, unsigned int base) -{ - int last, shift_left_factor = 0; - char *value = s; - - last = strlen(value) - 1; - if (value[last] == 'K' || value[last] == 'k') { - shift_left_factor = 10; - value[last] = '\0'; - } - if (value[last] == 'M' || value[last] == 'm') { - shift_left_factor = 20; - value[last] = '\0'; - } - if (value[last] == 'G' || value[last] == 'g') { - shift_left_factor = 30; - value[last] = '\0'; - } - - return simple_strtoul((const char *)s, endp, base) << shift_left_factor; -} - -STATIC int -xfs_parseargs( - struct bhv_desc *bhv, - char *options, - struct xfs_mount_args *args, - int update) -{ - bhv_vfs_t *vfsp = bhvtovfs(bhv); - char *this_char, *value, *eov; - int dsunit, dswidth, vol_dsunit, vol_dswidth; - int iosize; - - args->flags |= XFSMNT_IDELETE; - args->flags |= XFSMNT_BARRIER; - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; - - if (!options) - goto done; - - iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0; - - while ((this_char = strsep(&options, ",")) != NULL) { - if (!*this_char) - continue; - if ((value = strchr(this_char, '=')) != NULL) - *value++ = 0; - - if (!strcmp(this_char, MNTOPT_LOGBUFS)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - args->logbufs = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - args->logbufsize = suffix_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - strncpy(args->logname, value, MAXNAMELEN); - } else if (!strcmp(this_char, MNTOPT_MTPT)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - strncpy(args->mtpt, value, MAXNAMELEN); - } else if (!strcmp(this_char, MNTOPT_RTDEV)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - strncpy(args->rtname, value, MAXNAMELEN); - } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - iosize = simple_strtoul(value, &eov, 10); - args->flags |= XFSMNT_IOSIZE; - args->iosizelog = (uint8_t) iosize; - } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - iosize = suffix_strtoul(value, &eov, 10); - args->flags |= XFSMNT_IOSIZE; - args->iosizelog = ffs(iosize) - 1; - } else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - args->flags |= XFSMNT_IHASHSIZE; - args->ihashsize = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_GRPID) || - !strcmp(this_char, MNTOPT_BSDGROUPS)) { - vfsp->vfs_flag |= VFS_GRPID; - } else if (!strcmp(this_char, MNTOPT_NOGRPID) || - !strcmp(this_char, MNTOPT_SYSVGROUPS)) { - vfsp->vfs_flag &= ~VFS_GRPID; - } else if (!strcmp(this_char, MNTOPT_WSYNC)) { - args->flags |= XFSMNT_WSYNC; - } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) { - args->flags |= XFSMNT_OSYNCISOSYNC; - } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) { - args->flags |= XFSMNT_NORECOVERY; - } else if (!strcmp(this_char, MNTOPT_INO64)) { - args->flags |= XFSMNT_INO64; -#if !XFS_BIG_INUMS - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", - this_char); - return EINVAL; -#endif - } else if (!strcmp(this_char, MNTOPT_NOALIGN)) { - args->flags |= XFSMNT_NOALIGN; - } else if (!strcmp(this_char, MNTOPT_SWALLOC)) { - args->flags |= XFSMNT_SWALLOC; - } else if (!strcmp(this_char, MNTOPT_SUNIT)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - dsunit = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { - if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", - this_char); - return EINVAL; - } - dswidth = simple_strtoul(value, &eov, 10); - } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { - args->flags &= ~XFSMNT_32BITINODES; -#if !XFS_BIG_INUMS - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", - this_char); - return EINVAL; -#endif - } else if (!strcmp(this_char, MNTOPT_NOUUID)) { - args->flags |= XFSMNT_NOUUID; - } else if (!strcmp(this_char, MNTOPT_BARRIER)) { - args->flags |= XFSMNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) { - args->flags &= ~XFSMNT_BARRIER; - } else if (!strcmp(this_char, MNTOPT_IKEEP)) { - args->flags &= ~XFSMNT_IDELETE; - } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) { - args->flags |= XFSMNT_IDELETE; - } else if (!strcmp(this_char, MNTOPT_LARGEIO)) { - args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) { - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; - } else if (!strcmp(this_char, MNTOPT_ATTR2)) { - args->flags |= XFSMNT_ATTR2; - } else if (!strcmp(this_char, MNTOPT_NOATTR2)) { - args->flags &= ~XFSMNT_ATTR2; - } else if (!strcmp(this_char, "osyncisdsync")) { - /* no-op, this is now the default */ - cmn_err(CE_WARN, - "XFS: osyncisdsync is now the default, option is deprecated."); - } else if (!strcmp(this_char, "irixsgid")) { - cmn_err(CE_WARN, - "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); - } else { - cmn_err(CE_WARN, - "XFS: unknown mount option [%s].", this_char); - return EINVAL; - } - } - - if (args->flags & XFSMNT_NORECOVERY) { - if ((vfsp->vfs_flag & VFS_RDONLY) == 0) { - cmn_err(CE_WARN, - "XFS: no-recovery mounts must be read-only."); - return EINVAL; - } - } - - if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) { - cmn_err(CE_WARN, - "XFS: sunit and swidth options incompatible with the noalign option"); - return EINVAL; - } - - if ((dsunit && !dswidth) || (!dsunit && dswidth)) { - cmn_err(CE_WARN, - "XFS: sunit and swidth must be specified together"); - return EINVAL; - } - - if (dsunit && (dswidth % dsunit != 0)) { - cmn_err(CE_WARN, - "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", - dswidth, dsunit); - return EINVAL; - } - - if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) { - if (dsunit) { - args->sunit = dsunit; - args->flags |= XFSMNT_RETERR; - } else { - args->sunit = vol_dsunit; - } - dswidth ? (args->swidth = dswidth) : - (args->swidth = vol_dswidth); - } else { - args->sunit = args->swidth = 0; - } - -done: - if (args->flags & XFSMNT_32BITINODES) - vfsp->vfs_flag |= VFS_32BITINODES; - if (args->flags2) - args->flags |= XFSMNT_FLAGS2; - return 0; -} - -STATIC int -xfs_showargs( - struct bhv_desc *bhv, - struct seq_file *m) -{ - static struct proc_xfs_info { - int flag; - char *str; - } xfs_info[] = { - /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_WSYNC, "," MNTOPT_WSYNC }, - { XFS_MOUNT_INO64, "," MNTOPT_INO64 }, - { XFS_MOUNT_NOALIGN, "," MNTOPT_NOALIGN }, - { XFS_MOUNT_SWALLOC, "," MNTOPT_SWALLOC }, - { XFS_MOUNT_NOUUID, "," MNTOPT_NOUUID }, - { XFS_MOUNT_NORECOVERY, "," MNTOPT_NORECOVERY }, - { XFS_MOUNT_OSYNCISOSYNC, "," MNTOPT_OSYNCISOSYNC }, - { 0, NULL } - }; - struct proc_xfs_info *xfs_infop; - struct xfs_mount *mp = XFS_BHVTOM(bhv); - struct bhv_vfs *vfsp = XFS_MTOVFS(mp); - - for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) { - if (mp->m_flags & xfs_infop->flag) - seq_puts(m, xfs_infop->str); - } - - if (mp->m_flags & XFS_MOUNT_IHASHSIZE) - seq_printf(m, "," MNTOPT_IHASHSIZE "=%d", mp->m_ihsize); - - if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) - seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk", - (int)(1 << mp->m_writeio_log) >> 10); - - if (mp->m_logbufs > 0) - seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs); - if (mp->m_logbsize > 0) - seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10); - - if (mp->m_logname) - seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname); - if (mp->m_rtname) - seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname); - - if (mp->m_dalign > 0) - seq_printf(m, "," MNTOPT_SUNIT "=%d", - (int)XFS_FSB_TO_BB(mp, mp->m_dalign)); - if (mp->m_swidth > 0) - seq_printf(m, "," MNTOPT_SWIDTH "=%d", - (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - - if (!(mp->m_flags & XFS_MOUNT_IDELETE)) - seq_printf(m, "," MNTOPT_IKEEP); - if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) - seq_printf(m, "," MNTOPT_LARGEIO); - - if (!(vfsp->vfs_flag & VFS_32BITINODES)) - seq_printf(m, "," MNTOPT_64BITINODE); - if (vfsp->vfs_flag & VFS_GRPID) - seq_printf(m, "," MNTOPT_GRPID); - - return 0; -} - -STATIC void -xfs_freeze( - bhv_desc_t *bdp) -{ - xfs_mount_t *mp = XFS_BHVTOM(bdp); - - while (atomic_read(&mp->m_active_trans) > 0) - delay(100); - - /* Push the superblock and write an unmount record */ - xfs_log_unmount_write(mp); - xfs_unmountfs_writesb(mp); - xfs_fs_log_dummy(mp); -} - - -bhv_vfsops_t xfs_vfsops = { - BHV_IDENTITY_INIT(VFS_BHV_XFS,VFS_POSITION_XFS), - .vfs_parseargs = xfs_parseargs, - .vfs_showargs = xfs_showargs, - .vfs_mount = xfs_mount, - .vfs_unmount = xfs_unmount, - .vfs_mntupdate = xfs_mntupdate, - .vfs_root = xfs_root, - .vfs_statvfs = xfs_statvfs, - .vfs_sync = xfs_sync, - .vfs_vget = xfs_vget, - .vfs_dmapiops = (vfs_dmapiops_t)fs_nosys, - .vfs_quotactl = (vfs_quotactl_t)fs_nosys, - .vfs_init_vnode = xfs_initialize_vnode, - .vfs_force_shutdown = xfs_do_force_shutdown, - .vfs_freeze = xfs_freeze, -}; diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_vnode.h index 1e4a35ddf7f..e8a77383c0d 100644 --- a/fs/xfs/xfs_dmops.c +++ b/fs/xfs/xfs_vnode.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. + * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or @@ -15,22 +15,32 @@ * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "xfs.h" +#ifndef __XFS_VNODE_H__ +#define __XFS_VNODE_H__ + #include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -xfs_dmops_t xfs_dmcore_stub = { - .xfs_send_data = (xfs_send_data_t)fs_nosys, - .xfs_send_mmap = (xfs_send_mmap_t)fs_noerr, - .xfs_send_destroy = (xfs_send_destroy_t)fs_nosys, - .xfs_send_namesp = (xfs_send_namesp_t)fs_nosys, - .xfs_send_unmount = (xfs_send_unmount_t)fs_noval, -}; +struct file; +struct xfs_inode; +struct attrlist_cursor_kern; + +/* + * Flags for read/write calls - same values as IRIX + */ +#define IO_ISDIRECT 0x00004 /* bypass page cache */ +#define IO_INVIS 0x00020 /* don't update inode timestamps */ + +#define XFS_IO_FLAGS \ + { IO_ISDIRECT, "DIRECT" }, \ + { IO_INVIS, "INVIS"} + +/* + * Some useful predicates. + */ +#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping) +#define VN_CACHED(vp) (vp->i_mapping->nrpages) +#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \ + PAGECACHE_TAG_DIRTY) + + +#endif /* __XFS_VNODE_H__ */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c deleted file mode 100644 index 23cfa583772..00000000000 --- a/fs/xfs/xfs_vnodeops.c +++ /dev/null @@ -1,4687 +0,0 @@ -/* - * Copyright (c) 2000-2006 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "xfs.h" -#include "xfs_fs.h" -#include "xfs_types.h" -#include "xfs_bit.h" -#include "xfs_log.h" -#include "xfs_inum.h" -#include "xfs_trans.h" -#include "xfs_sb.h" -#include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" -#include "xfs_mount.h" -#include "xfs_da_btree.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" -#include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_itable.h" -#include "xfs_btree.h" -#include "xfs_ialloc.h" -#include "xfs_alloc.h" -#include "xfs_bmap.h" -#include "xfs_attr.h" -#include "xfs_rw.h" -#include "xfs_error.h" -#include "xfs_quota.h" -#include "xfs_utils.h" -#include "xfs_rtalloc.h" -#include "xfs_refcache.h" -#include "xfs_trans_space.h" -#include "xfs_log_priv.h" -#include "xfs_mac.h" - -STATIC int -xfs_open( - bhv_desc_t *bdp, - cred_t *credp) -{ - int mode; - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - xfs_inode_t *ip = XFS_BHVTOI(bdp); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - /* - * If it's a directory with any blocks, read-ahead block 0 - * as we're almost certain to have the next operation be a read there. - */ - if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) { - mode = xfs_ilock_map_shared(ip); - if (ip->i_d.di_nextents > 0) - (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); - xfs_iunlock(ip, mode); - } - return 0; -} - -STATIC int -xfs_close( - bhv_desc_t *bdp, - int flags, - lastclose_t lastclose, - cred_t *credp) -{ - bhv_vnode_t *vp = BHV_TO_VNODE(bdp); - xfs_inode_t *ip = XFS_BHVTOI(bdp); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - if (lastclose != L_TRUE || !VN_ISREG(vp)) - return 0; - - /* - * If we previously truncated this file and removed old data in - * the process, we want to initiate "early" writeout on the last - * close. This is an attempt to combat the notorious NULL files - * problem which is particularly noticable from a truncate down, - * buffered (re-)write (delalloc), followed by a crash. What we - * are effectively doing here is significantly reducing the time - * window where we'd otherwise be exposed to that problem. - */ - if (VUNTRUNCATE(vp) && VN_DIRTY(vp) && ip->i_delayed_blks > 0) - return bhv_vop_flush_pages(vp, 0, -1, XFS_B_ASYNC, FI_NONE); - return 0; -} - -/* - * xfs_getattr - */ -STATIC int -xfs_getattr( - bhv_desc_t *bdp, - bhv_vattr_t *vap, - int flags, - cred_t *credp) -{ - xfs_inode_t *ip; - xfs_mount_t *mp; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (!(flags & ATTR_LAZY)) - xfs_ilock(ip, XFS_ILOCK_SHARED); - - vap->va_size = ip->i_d.di_size; - if (vap->va_mask == XFS_AT_SIZE) - goto all_done; - - vap->va_nblocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - vap->va_nodeid = ip->i_ino; -#if XFS_BIG_INUMS - vap->va_nodeid += mp->m_inoadd; -#endif - vap->va_nlink = ip->i_d.di_nlink; - - /* - * Quick exit for non-stat callers - */ - if ((vap->va_mask & - ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID| - XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0) - goto all_done; - - /* - * Copy from in-core inode. - */ - vap->va_mode = ip->i_d.di_mode; - vap->va_uid = ip->i_d.di_uid; - vap->va_gid = ip->i_d.di_gid; - vap->va_projid = ip->i_d.di_projid; - - /* - * Check vnode type block/char vs. everything else. - */ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - vap->va_rdev = ip->i_df.if_u2.if_rdev; - vap->va_blocksize = BLKDEV_IOSIZE; - break; - default: - vap->va_rdev = 0; - - if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) { - vap->va_blocksize = xfs_preferred_iosize(mp); - } else { - - /* - * If the file blocks are being allocated from a - * realtime partition, then return the inode's - * realtime extent size or the realtime volume's - * extent size. - */ - vap->va_blocksize = ip->i_d.di_extsize ? - (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) : - (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog); - } - break; - } - - vn_atime_to_timespec(vp, &vap->va_atime); - vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - - /* - * Exit for stat callers. See if any of the rest of the fields - * to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - /* - * Convert di_flags to xflags. - */ - vap->va_xflags = xfs_ip2xflags(ip); - - /* - * Exit for inode revalidate. See if any of the rest of - * the fields to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog; - vap->va_nextents = - (ip->i_df.if_flags & XFS_IFEXTENTS) ? - ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_nextents; - if (ip->i_afp) - vap->va_anextents = - (ip->i_afp->if_flags & XFS_IFEXTENTS) ? - ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_anextents; - else - vap->va_anextents = 0; - vap->va_gen = ip->i_d.di_gen; - - all_done: - if (!(flags & ATTR_LAZY)) - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return 0; -} - - -/* - * xfs_setattr - */ -int -xfs_setattr( - bhv_desc_t *bdp, - bhv_vattr_t *vap, - int flags, - cred_t *credp) -{ - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int mask; - int code; - uint lock_flags; - uint commit_flags=0; - uid_t uid=0, iuid=0; - gid_t gid=0, igid=0; - int timeflags = 0; - bhv_vnode_t *vp; - xfs_prid_t projid=0, iprojid=0; - int mandlock_before, mandlock_after; - struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; - int file_owner; - int need_iolock = 1; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) - return XFS_ERROR(EROFS); - - /* - * Cannot set certain attributes. - */ - mask = vap->va_mask; - if (mask & XFS_AT_NOSET) { - return XFS_ERROR(EINVAL); - } - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Timestamps do not need to be logged and hence do not - * need to be done within a transaction. - */ - if (mask & XFS_AT_UPDTIMES) { - ASSERT((mask & ~XFS_AT_UPDTIMES) == 0); - timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) | - ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) | - ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0); - xfs_ichgtime(ip, timeflags); - return 0; - } - - olddquot1 = olddquot2 = NULL; - udqp = gdqp = NULL; - - /* - * If disk quotas is on, we make sure that the dquots do exist on disk, - * before we start any other transactions. Trying to do this later - * is messy. We don't care to take a readlock to look at the ids - * in inode here, because we can't hold it across the trans_reserve. - * If the IDs do change before we take the ilock, we're covered - * because the i_*dquot fields will get updated anyway. - */ - if (XFS_IS_QUOTA_ON(mp) && - (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) { - uint qflags = 0; - - if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) { - uid = vap->va_uid; - qflags |= XFS_QMOPT_UQUOTA; - } else { - uid = ip->i_d.di_uid; - } - if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) { - gid = vap->va_gid; - qflags |= XFS_QMOPT_GQUOTA; - } else { - gid = ip->i_d.di_gid; - } - if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) { - projid = vap->va_projid; - qflags |= XFS_QMOPT_PQUOTA; - } else { - projid = ip->i_d.di_projid; - } - /* - * We take a reference when we initialize udqp and gdqp, - * so it is important that we never blindly double trip on - * the same variable. See xfs_create() for an example. - */ - ASSERT(udqp == NULL); - ASSERT(gdqp == NULL); - code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags, - &udqp, &gdqp); - if (code) - return code; - } - - /* - * For the other attributes, we acquire the inode lock and - * first do an error checking pass. - */ - tp = NULL; - lock_flags = XFS_ILOCK_EXCL; - if (flags & ATTR_NOLOCK) - need_iolock = 0; - if (!(mask & XFS_AT_SIZE)) { - if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) || - (mp->m_flags & XFS_MOUNT_WSYNC)) { - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - commit_flags = 0; - if ((code = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - lock_flags = 0; - goto error_return; - } - } - } else { - if (DM_EVENT_ENABLED (vp->v_vfsp, ip, DM_EVENT_TRUNCATE) && - !(flags & ATTR_DMI)) { - int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; - code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp, - vap->va_size, 0, dmflags, NULL); - if (code) { - lock_flags = 0; - goto error_return; - } - } - if (need_iolock) - lock_flags |= XFS_IOLOCK_EXCL; - } - - xfs_ilock(ip, lock_flags); - - /* boolean: are we the file owner? */ - file_owner = (current_fsuid(credp) == ip->i_d.di_uid); - - /* - * Change various properties of a file. - * Only the owner or users with CAP_FOWNER - * capability may do these things. - */ - if (mask & - (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID| - XFS_AT_GID|XFS_AT_PROJID)) { - /* - * CAP_FOWNER overrides the following restrictions: - * - * The user ID of the calling process must be equal - * to the file owner ID, except in cases where the - * CAP_FSETID capability is applicable. - */ - if (!file_owner && !capable(CAP_FOWNER)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * CAP_FSETID overrides the following restrictions: - * - * The effective user ID of the calling process shall match - * the file owner when setting the set-user-ID and - * set-group-ID bits on that file. - * - * The effective group ID or one of the supplementary group - * IDs of the calling process shall match the group owner of - * the file when setting the set-group-ID bit on that file - */ - if (mask & XFS_AT_MODE) { - mode_t m = 0; - - if ((vap->va_mode & S_ISUID) && !file_owner) - m |= S_ISUID; - if ((vap->va_mode & S_ISGID) && - !in_group_p((gid_t)ip->i_d.di_gid)) - m |= S_ISGID; -#if 0 - /* Linux allows this, Irix doesn't. */ - if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp)) - m |= S_ISVTX; -#endif - if (m && !capable(CAP_FSETID)) - vap->va_mode &= ~m; - } - } - - /* - * Change file ownership. Must be the owner or privileged. - * If the system was configured with the "restricted_chown" - * option, the owner is not permitted to give away the file, - * and can change the group id only to a group of which he - * or she is a member. - */ - if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { - /* - * These IDs could have changed since we last looked at them. - * But, we're assured that if the ownership did change - * while we didn't have the inode locked, inode's dquot(s) - * would have changed also. - */ - iuid = ip->i_d.di_uid; - iprojid = ip->i_d.di_projid; - igid = ip->i_d.di_gid; - gid = (mask & XFS_AT_GID) ? vap->va_gid : igid; - uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid; - projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid : - iprojid; - - /* - * CAP_CHOWN overrides the following restrictions: - * - * If _POSIX_CHOWN_RESTRICTED is defined, this capability - * shall override the restriction that a process cannot - * change the user ID of a file it owns and the restriction - * that the group ID supplied to the chown() function - * shall be equal to either the group ID or one of the - * supplementary group IDs of the calling process. - */ - if (restricted_chown && - (iuid != uid || (igid != gid && - !in_group_p((gid_t)gid))) && - !capable(CAP_CHOWN)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - /* - * Do a quota reservation only if uid/projid/gid is actually - * going to change. - */ - if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) || - (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) || - (XFS_IS_GQUOTA_ON(mp) && igid != gid)) { - ASSERT(tp); - code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp, - capable(CAP_FOWNER) ? - XFS_QMOPT_FORCE_RES : 0); - if (code) /* out of quota */ - goto error_return; - } - } - - /* - * Truncate file. Must have write permission and not be a directory. - */ - if (mask & XFS_AT_SIZE) { - /* Short circuit the truncate case for zero length files */ - if ((vap->va_size == 0) && - (ip->i_d.di_size == 0) && (ip->i_d.di_nextents == 0)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - lock_flags &= ~XFS_ILOCK_EXCL; - if (mask & XFS_AT_CTIME) - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - code = 0; - goto error_return; - } - - if (VN_ISDIR(vp)) { - code = XFS_ERROR(EISDIR); - goto error_return; - } else if (!VN_ISREG(vp)) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - /* - * Make sure that the dquots are attached to the inode. - */ - if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED))) - goto error_return; - } - - /* - * Change file access or modified times. - */ - if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { - if (!file_owner) { - if ((flags & ATTR_UTIME) && - !capable(CAP_FOWNER)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - } - } - - /* - * Change extent size or realtime flag. - */ - if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { - /* - * Can't change extent size if any extents are allocated. - */ - if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) && - ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != - vap->va_extsize) ) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ - goto error_return; - } - - /* - * Can't change realtime flag if any extents are allocated. - */ - if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - (mask & XFS_AT_XFLAGS) && - (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != - (vap->va_xflags & XFS_XFLAG_REALTIME)) { - code = XFS_ERROR(EINVAL); /* EFBIG? */ - goto error_return; - } - /* - * Extent size must be a multiple of the appropriate block - * size, if set at all. - */ - if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) { - xfs_extlen_t size; - - if ((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) || - ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME))) { - size = mp->m_sb.sb_rextsize << - mp->m_sb.sb_blocklog; - } else { - size = mp->m_sb.sb_blocksize; - } - if (vap->va_extsize % size) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - } - /* - * If realtime flag is set then must have realtime data. - */ - if ((mask & XFS_AT_XFLAGS) && - (vap->va_xflags & XFS_XFLAG_REALTIME)) { - if ((mp->m_sb.sb_rblocks == 0) || - (mp->m_sb.sb_rextsize == 0) || - (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { - code = XFS_ERROR(EINVAL); - goto error_return; - } - } - - /* - * Can't modify an immutable/append-only file unless - * we have appropriate permission. - */ - if ((mask & XFS_AT_XFLAGS) && - (ip->i_d.di_flags & - (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) || - (vap->va_xflags & - (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && - !capable(CAP_LINUX_IMMUTABLE)) { - code = XFS_ERROR(EPERM); - goto error_return; - } - } - - /* - * Now we can make the changes. Before we join the inode - * to the transaction, if XFS_AT_SIZE is set then take care of - * the part of the truncation that must be done without the - * inode lock. This needs to be done before joining the inode - * to the transaction, because the inode cannot be unlocked - * once it is a part of the transaction. - */ - if (mask & XFS_AT_SIZE) { - code = 0; - if ((vap->va_size > ip->i_d.di_size) && - (flags & ATTR_NOSIZETOK) == 0) { - code = xfs_igrow_start(ip, vap->va_size, credp); - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ - if (!code) - code = xfs_itruncate_data(ip, vap->va_size); - if (code) { - ASSERT(tp == NULL); - lock_flags &= ~XFS_ILOCK_EXCL; - ASSERT(lock_flags == XFS_IOLOCK_EXCL); - goto error_return; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); - if ((code = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT))) { - xfs_trans_cancel(tp, 0); - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return code; - } - commit_flags = XFS_TRANS_RELEASE_LOG_RES; - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - - if (tp) { - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ihold(tp, ip); - } - - /* determine whether mandatory locking mode changes */ - mandlock_before = MANDLOCK(vp, ip->i_d.di_mode); - - /* - * Truncate file. Must have write permission and not be a directory. - */ - if (mask & XFS_AT_SIZE) { - if (vap->va_size > ip->i_d.di_size) { - xfs_igrow_finish(tp, ip, vap->va_size, - !(flags & ATTR_DMI)); - } else if ((vap->va_size <= ip->i_d.di_size) || - ((vap->va_size == 0) && ip->i_d.di_nextents)) { - /* - * signal a sync transaction unless - * we're truncating an already unlinked - * file on a wsync filesystem - */ - code = xfs_itruncate_finish(&tp, ip, - (xfs_fsize_t)vap->va_size, - XFS_DATA_FORK, - ((ip->i_d.di_nlink != 0 || - !(mp->m_flags & XFS_MOUNT_WSYNC)) - ? 1 : 0)); - if (code) - goto abort_return; - /* - * Truncated "down", so we're removing references - * to old data here - if we now delay flushing for - * a long time, we expose ourselves unduly to the - * notorious NULL files problem. So, we mark this - * vnode and flush it when the file is closed, and - * do not wait the usual (long) time for writeout. - */ - VTRUNCATE(vp); - } - /* - * Have to do this even if the file's size doesn't change. - */ - timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; - } - - /* - * Change file access modes. - */ - if (mask & XFS_AT_MODE) { - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= vap->va_mode & ~S_IFMT; - - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; - } - - /* - * Change file ownership. Must be the owner or privileged. - * If the system was configured with the "restricted_chown" - * option, the owner is not permitted to give away the file, - * and can change the group id only to a group of which he - * or she is a member. - */ - if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) { - /* - * CAP_FSETID overrides the following restrictions: - * - * The set-user-ID and set-group-ID bits of a file will be - * cleared upon successful return from chown() - */ - if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) { - ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); - } - - /* - * Change the ownerships and register quota modifications - * in the transaction. - */ - if (iuid != uid) { - if (XFS_IS_UQUOTA_ON(mp)) { - ASSERT(mask & XFS_AT_UID); - ASSERT(udqp); - olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip, - &ip->i_udquot, udqp); - } - ip->i_d.di_uid = uid; - } - if (igid != gid) { - if (XFS_IS_GQUOTA_ON(mp)) { - ASSERT(!XFS_IS_PQUOTA_ON(mp)); - ASSERT(mask & XFS_AT_GID); - ASSERT(gdqp); - olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, - &ip->i_gdquot, gdqp); - } - ip->i_d.di_gid = gid; - } - if (iprojid != projid) { - if (XFS_IS_PQUOTA_ON(mp)) { - ASSERT(!XFS_IS_GQUOTA_ON(mp)); - ASSERT(mask & XFS_AT_PROJID); - ASSERT(gdqp); - olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip, - &ip->i_gdquot, gdqp); - } - ip->i_d.di_projid = projid; - /* - * We may have to rev the inode as well as - * the superblock version number since projids didn't - * exist before DINODE_VERSION_2 and SB_VERSION_NLINK. - */ - if (ip->i_d.di_version == XFS_DINODE_VERSION_1) - xfs_bump_ino_vers2(tp, ip); - } - - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; - } - - - /* - * Change file access or modified times. - */ - if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) { - if (mask & XFS_AT_ATIME) { - ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec; - ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec; - ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_ACC; - } - if (mask & XFS_AT_MTIME) { - ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec; - timeflags &= ~XFS_ICHGTIME_MOD; - timeflags |= XFS_ICHGTIME_CHG; - } - if (tp && (flags & ATTR_UTIME)) - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - } - - /* - * Change XFS-added attributes. - */ - if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) { - if (mask & XFS_AT_EXTSIZE) { - /* - * Converting bytes to fs blocks. - */ - ip->i_d.di_extsize = vap->va_extsize >> - mp->m_sb.sb_blocklog; - } - if (mask & XFS_AT_XFLAGS) { - uint di_flags; - - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (vap->va_xflags & XFS_XFLAG_IMMUTABLE) - di_flags |= XFS_DIFLAG_IMMUTABLE; - if (vap->va_xflags & XFS_XFLAG_APPEND) - di_flags |= XFS_DIFLAG_APPEND; - if (vap->va_xflags & XFS_XFLAG_SYNC) - di_flags |= XFS_DIFLAG_SYNC; - if (vap->va_xflags & XFS_XFLAG_NOATIME) - di_flags |= XFS_DIFLAG_NOATIME; - if (vap->va_xflags & XFS_XFLAG_NODUMP) - di_flags |= XFS_DIFLAG_NODUMP; - if (vap->va_xflags & XFS_XFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - if (vap->va_xflags & XFS_XFLAG_NODEFRAG) - di_flags |= XFS_DIFLAG_NODEFRAG; - if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { - if (vap->va_xflags & XFS_XFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT) - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { - if (vap->va_xflags & XFS_XFLAG_REALTIME) { - di_flags |= XFS_DIFLAG_REALTIME; - ip->i_iocore.io_flags |= XFS_IOCORE_RT; - } else { - ip->i_iocore.io_flags &= ~XFS_IOCORE_RT; - } - if (vap->va_xflags & XFS_XFLAG_EXTSIZE) - di_flags |= XFS_DIFLAG_EXTSIZE; - } - ip->i_d.di_flags = di_flags; - } - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; - } - - /* - * Change file inode change time only if XFS_AT_CTIME set - * AND we have been called by a DMI function. - */ - - if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) { - ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec; - ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec; - ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_CHG; - } - - /* - * Send out timestamp changes that need to be set to the - * current time. Not done when called by a DMI function. - */ - if (timeflags && !(flags & ATTR_DMI)) - xfs_ichgtime(ip, timeflags); - - XFS_STATS_INC(xs_ig_attrchg); - - /* - * If this is a synchronous mount, make sure that the - * transaction goes to disk before returning to the user. - * This is slightly sub-optimal in that truncates require - * two sync transactions instead of one for wsync filesystems. - * One for the truncate and one for the timestamps since we - * don't want to change the timestamps unless we're sure the - * truncate worked. Truncates are less than 1% of the laddis - * mix so this probably isn't worth the trouble to optimize. - */ - code = 0; - if (tp) { - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); - - code = xfs_trans_commit(tp, commit_flags, NULL); - } - - /* - * If the (regular) file's mandatory locking mode changed, then - * notify the vnode. We do this under the inode lock to prevent - * racing calls to vop_vnode_change. - */ - mandlock_after = MANDLOCK(vp, ip->i_d.di_mode); - if (mandlock_before != mandlock_after) { - bhv_vop_vnode_change(vp, VCHANGE_FLAGS_ENF_LOCKING, - mandlock_after); - } - - xfs_iunlock(ip, lock_flags); - - /* - * Release any dquot(s) the inode had kept before chown. - */ - XFS_QM_DQRELE(mp, olddquot1); - XFS_QM_DQRELE(mp, olddquot2); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - if (code) { - return code; - } - - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_ATTRIBUTE) && - !(flags & ATTR_DMI)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, NULL, NULL, - 0, 0, AT_DELAY_FLAG(flags)); - } - return 0; - - abort_return: - commit_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - error_return: - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - if (tp) { - xfs_trans_cancel(tp, commit_flags); - } - if (lock_flags != 0) { - xfs_iunlock(ip, lock_flags); - } - return code; -} - - -/* - * xfs_access - * Null conversion from vnode mode bits to inode mode bits, as in efs. - */ -STATIC int -xfs_access( - bhv_desc_t *bdp, - int mode, - cred_t *credp) -{ - xfs_inode_t *ip; - int error; - - vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, - (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_iaccess(ip, mode, credp); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - - -/* - * The maximum pathlen is 1024 bytes. Since the minimum file system - * blocksize is 512 bytes, we can get a max of 2 extents back from - * bmapi. - */ -#define SYMLINK_MAPS 2 - -/* - * xfs_readlink - * - */ -STATIC int -xfs_readlink( - bhv_desc_t *bdp, - uio_t *uiop, - int ioflags, - cred_t *credp) -{ - xfs_inode_t *ip; - int count; - xfs_off_t offset; - int pathlen; - bhv_vnode_t *vp; - int error = 0; - xfs_mount_t *mp; - int nmaps; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - int byte_cnt; - int n; - xfs_buf_t *bp; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); - - offset = uiop->uio_offset; - count = uiop->uio_resid; - - if (offset < 0) { - error = XFS_ERROR(EINVAL); - goto error_return; - } - if (count <= 0) { - error = 0; - goto error_return; - } - - /* - * See if the symlink is stored inline. - */ - pathlen = (int)ip->i_d.di_size; - - if (ip->i_df.if_flags & XFS_IFINLINE) { - error = uio_read(ip->i_df.if_u1.if_data, pathlen, uiop); - } - else { - /* - * Symlink not inline. Call bmap to get it in. - */ - nmaps = SYMLINK_MAPS; - - error = xfs_bmapi(NULL, ip, 0, XFS_B_TO_FSB(mp, pathlen), - 0, NULL, 0, mval, &nmaps, NULL, NULL); - - if (error) { - goto error_return; - } - - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - error = XFS_BUF_GETERROR(bp); - if (error) { - xfs_ioerror_alert("xfs_readlink", - ip->i_mount, bp, XFS_BUF_ADDR(bp)); - xfs_buf_relse(bp); - goto error_return; - } - if (pathlen < byte_cnt) - byte_cnt = pathlen; - pathlen -= byte_cnt; - - error = uio_read(XFS_BUF_PTR(bp), byte_cnt, uiop); - xfs_buf_relse (bp); - } - - } - -error_return: - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; -} - - -/* - * xfs_fsync - * - * This is called to sync the inode and its data out to disk. - * We need to hold the I/O lock while flushing the data, and - * the inode lock while flushing the inode. The inode lock CANNOT - * be held while flushing the data, so acquire after we're done - * with that. - */ -STATIC int -xfs_fsync( - bhv_desc_t *bdp, - int flag, - cred_t *credp, - xfs_off_t start, - xfs_off_t stop) -{ - xfs_inode_t *ip; - xfs_trans_t *tp; - int error; - int log_flushed = 0, changed = 1; - - vn_trace_entry(BHV_TO_VNODE(bdp), - __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - - ASSERT(start >= 0 && stop >= -1); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return XFS_ERROR(EIO); - - /* - * We always need to make sure that the required inode state - * is safe on disk. The vnode might be clean but because - * of committed transactions that haven't hit the disk yet. - * Likewise, there could be unflushed non-transactional - * changes to the inode core that have to go to disk. - * - * The following code depends on one assumption: that - * any transaction that changes an inode logs the core - * because it has to change some field in the inode core - * (typically nextents or nblocks). That assumption - * implies that any transactions against an inode will - * catch any non-transactional updates. If inode-altering - * transactions exist that violate this assumption, the - * code breaks. Right now, it figures that if the involved - * update_* field is clear and the inode is unpinned, the - * inode is clean. Either it's been flushed or it's been - * committed and the commit has hit the disk unpinning the inode. - * (Note that xfs_inode_item_format() called at commit clears - * the update_* fields.) - */ - xfs_ilock(ip, XFS_ILOCK_SHARED); - - /* If we are flushing data then we care about update_size - * being set, otherwise we care about update_core - */ - if ((flag & FSYNC_DATA) ? - (ip->i_update_size == 0) : - (ip->i_update_core == 0)) { - /* - * Timestamps/size haven't changed since last inode - * flush or inode transaction commit. That means - * either nothing got written or a transaction - * committed which caught the updates. If the - * latter happened and the transaction hasn't - * hit the disk yet, the inode will be still - * be pinned. If it is, force the log. - */ - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (xfs_ipincount(ip)) { - _xfs_log_force(ip->i_mount, (xfs_lsn_t)0, - XFS_LOG_FORCE | - ((flag & FSYNC_WAIT) - ? XFS_LOG_SYNC : 0), - &log_flushed); - } else { - /* - * If the inode is not pinned and nothing - * has changed we don't need to flush the - * cache. - */ - changed = 0; - } - error = 0; - } else { - /* - * Kick off a transaction to log the inode - * core to get the updates. Make it - * sync if FSYNC_WAIT is passed in (which - * is done by everybody but specfs). The - * sync transaction will also force the log. - */ - xfs_iunlock(ip, XFS_ILOCK_SHARED); - tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS); - if ((error = xfs_trans_reserve(tp, 0, - XFS_FSYNC_TS_LOG_RES(ip->i_mount), - 0, 0, 0))) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* - * Note - it's possible that we might have pushed - * ourselves out of the way during trans_reserve - * which would flush the inode. But there's no - * guarantee that the inode buffer has actually - * gone out yet (it's delwri). Plus the buffer - * could be pinned anyway if it's part of an - * inode in another recent transaction. So we - * play it safe and fire off the transaction anyway. - */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (flag & FSYNC_WAIT) - xfs_trans_set_sync(tp); - error = _xfs_trans_commit(tp, 0, NULL, &log_flushed); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && changed) { - /* - * If the log write didn't issue an ordered tag we need - * to flush the disk cache for the data device now. - */ - if (!log_flushed) - xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp); - - /* - * If this inode is on the RT dev we need to flush that - * cache as well. - */ - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) - xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); - } - - return error; -} - -/* - * This is called by xfs_inactive to free any blocks beyond eof, - * when the link count isn't zero. - */ -STATIC int -xfs_inactive_free_eofblocks( - xfs_mount_t *mp, - xfs_inode_t *ip) -{ - xfs_trans_t *tp; - int error; - xfs_fileoff_t end_fsb; - xfs_fileoff_t last_fsb; - xfs_filblks_t map_len; - int nimaps; - xfs_bmbt_irec_t imap; - - /* - * Figure out if there are any blocks beyond the end - * of the file. If not, then there is nothing to do. - */ - end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_d.di_size)); - last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); - map_len = last_fsb - end_fsb; - if (map_len <= 0) - return 0; - - nimaps = 1; - xfs_ilock(ip, XFS_ILOCK_SHARED); - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, end_fsb, map_len, 0, - NULL, 0, &imap, &nimaps, NULL, NULL); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - if (!error && (nimaps != 0) && - (imap.br_startblock != HOLESTARTBLOCK || - ip->i_delayed_blks)) { - /* - * Attach the dquots to the inode up front. - */ - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) - return error; - - /* - * There are blocks after the end of file. - * Free them up now by truncating the file to - * its current size. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - - /* - * Do the xfs_itruncate_start() call before - * reserving any log space because - * itruncate_start will call into the buffer - * cache and we can't - * do that within a transaction. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, - ip->i_d.di_size); - - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, - XFS_IOLOCK_EXCL | - XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - error = xfs_itruncate_finish(&tp, ip, - ip->i_d.di_size, - XFS_DATA_FORK, - 0); - /* - * If we get an error at this point we - * simply don't bother truncating the file. - */ - if (error) { - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES, - NULL); - } - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - } - return error; -} - -/* - * Free a symlink that has blocks associated with it. - */ -STATIC int -xfs_inactive_symlink_rmt( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_buf_t *bp; - int committed; - int done; - int error; - xfs_fsblock_t first_block; - xfs_bmap_free_t free_list; - int i; - xfs_mount_t *mp; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - int nmaps; - xfs_trans_t *ntp; - int size; - xfs_trans_t *tp; - - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_size > XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink that has some - * blocks allocated to it. Free the - * blocks here. We know that we've got - * either 1 or 2 extents and that we can - * free them all in one bunmapi call. - */ - ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2); - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - *tpp = NULL; - return error; - } - /* - * Lock the inode, fix the size, and join it to the transaction. - * Hold it so in the normal path, we still have it locked for - * the second transaction. In the error paths we need it - * held so the cancel won't rele it, see below. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - size = (int)ip->i_d.di_size; - ip->i_d.di_size = 0; - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Find the block(s) so we can inval and unmap them. - */ - done = 0; - XFS_BMAP_INIT(&free_list, &first_block); - nmaps = ARRAY_SIZE(mval); - if ((error = xfs_bmapi(tp, ip, 0, XFS_B_TO_FSB(mp, size), - XFS_BMAPI_METADATA, &first_block, 0, mval, &nmaps, - &free_list, NULL))) - goto error0; - /* - * Invalidate the block(s). - */ - for (i = 0; i < nmaps; i++) { - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), - XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); - xfs_trans_binval(tp, bp); - } - /* - * Unmap the dead block(s) to the free_list. - */ - if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps, - &first_block, &free_list, NULL, &done))) - goto error1; - ASSERT(done); - /* - * Commit the first transaction. This logs the EFI and the inode. - */ - if ((error = xfs_bmap_finish(&tp, &free_list, first_block, &committed))) - goto error1; - /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - /* - * Get a new, empty transaction to return to our caller. - */ - ntp = xfs_trans_dup(tp); - /* - * Commit the transaction containing extent freeing and EFDs. - * If we get an error on the commit here or on the reserve below, - * we need to unlock the inode since the new transaction doesn't - * have the inode attached. - */ - error = xfs_trans_commit(tp, 0, NULL); - tp = ntp; - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - /* - * Remove the memory for extent descriptions (just bookkeeping). - */ - if (ip->i_df.if_bytes) - xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - /* - * Put an itruncate log reservation in the new transaction - * for our caller. - */ - if ((error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_ITRUNCATE_LOG_COUNT))) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - goto error0; - } - /* - * Return with the inode locked but not joined to the transaction. - */ - *tpp = tp; - return 0; - - error1: - xfs_bmap_cancel(&free_list); - error0: - /* - * Have to come here with the inode locked and either - * (held and in the transaction) or (not in the transaction). - * If the inode isn't held then cancel would iput it, but - * that's wrong since this is inactive and the vnode ref - * count is 0 already. - * Cancel won't do anything to the inode if held, but it still - * needs to be locked until the cancel is done, if it was - * joined to the transaction. - */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - *tpp = NULL; - return error; - -} - -STATIC int -xfs_inactive_symlink_local( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - int error; - - ASSERT(ip->i_d.di_size <= XFS_IFORK_DSIZE(ip)); - /* - * We're freeing a symlink which fit into - * the inode. Just free the memory used - * to hold the old symlink. - */ - error = xfs_trans_reserve(*tpp, 0, - XFS_ITRUNCATE_LOG_RES(ip->i_mount), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - - if (error) { - xfs_trans_cancel(*tpp, 0); - *tpp = NULL; - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - - /* - * Zero length symlinks _can_ exist. - */ - if (ip->i_df.if_bytes > 0) { - xfs_idata_realloc(ip, - -(ip->i_df.if_bytes), - XFS_DATA_FORK); - ASSERT(ip->i_df.if_bytes == 0); - } - return 0; -} - -STATIC int -xfs_inactive_attrs( - xfs_inode_t *ip, - xfs_trans_t **tpp) -{ - xfs_trans_t *tp; - int error; - xfs_mount_t *mp; - - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); - tp = *tpp; - mp = ip->i_mount; - ASSERT(ip->i_d.di_forkoff != 0); - xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - error = xfs_attr_inactive(ip); - if (error) { - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; /* goto out */ - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - error = xfs_trans_reserve(tp, 0, - XFS_IFREE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_INACTIVE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - *tpp = NULL; - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - - ASSERT(ip->i_d.di_anextents == 0); - - *tpp = tp; - return 0; -} - -STATIC int -xfs_release( - bhv_desc_t *bdp) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - xfs_mount_t *mp; - int error; - - vp = BHV_TO_VNODE(bdp); - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) - return 0; - - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) - return 0; - -#ifdef HAVE_REFCACHE - /* If we are in the NFS reference cache then don't do this now */ - if (ip->i_refcache) - return 0; -#endif - - if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS)) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { - if ((error = xfs_inactive_free_eofblocks(mp, ip))) - return error; - /* Update linux inode block count after free above */ - vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, - ip->i_d.di_nblocks + ip->i_delayed_blks); - } - } - - return 0; -} - -/* - * xfs_inactive - * - * This is called when the vnode reference count for the vnode - * goes to zero. If the file has been unlinked, then it must - * now be truncated. Also, we clear all of the read-ahead state - * kept for the inode here since the file is now closed. - */ -STATIC int -xfs_inactive( - bhv_desc_t *bdp, - cred_t *credp) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int committed; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error; - int truncate; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - - /* - * If the inode is already free, then there can be nothing - * to clean up here. - */ - if (ip->i_d.di_mode == 0 || VN_BAD(vp)) { - ASSERT(ip->i_df.if_real_bytes == 0); - ASSERT(ip->i_df.if_broot_bytes == 0); - return VN_INACTIVE_CACHE; - } - - /* - * Only do a truncate if it's a regular file with - * some actual space in it. It's OK to look at the - * inode's fields without the lock because we're the - * only one with a reference to the inode. - */ - truncate = ((ip->i_d.di_nlink == 0) && - ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0) || - (ip->i_delayed_blks > 0)) && - ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); - - mp = ip->i_mount; - - if (ip->i_d.di_nlink == 0 && - DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_DESTROY)) { - (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL); - } - - error = 0; - - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (vp->v_vfsp->vfs_flag & VFS_RDONLY) - goto out; - - if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && - ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0 || - ip->i_delayed_blks > 0)) && - (ip->i_df.if_flags & XFS_IFEXTENTS) && - (!(ip->i_d.di_flags & - (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || - (ip->i_delayed_blks != 0)))) { - if ((error = xfs_inactive_free_eofblocks(mp, ip))) - return VN_INACTIVE_CACHE; - /* Update linux inode block count after free above */ - vn_to_inode(vp)->i_blocks = XFS_FSB_TO_BB(mp, - ip->i_d.di_nblocks + ip->i_delayed_blks); - } - goto out; - } - - ASSERT(ip->i_d.di_nlink == 0); - - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) - return VN_INACTIVE_CACHE; - - tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); - if (truncate) { - /* - * Do the xfs_itruncate_start() call before - * reserving any log space because itruncate_start - * will call into the buffer cache and we can't - * do that within a transaction. - */ - xfs_ilock(ip, XFS_IOLOCK_EXCL); - - xfs_itruncate_start(ip, XFS_ITRUNC_DEFINITE, 0); - - error = xfs_trans_reserve(tp, 0, - XFS_ITRUNCATE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_ITRUNCATE_LOG_COUNT); - if (error) { - /* Don't call itruncate_cleanup */ - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return VN_INACTIVE_CACHE; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - /* - * normally, we have to run xfs_itruncate_finish sync. - * But if filesystem is wsync and we're in the inactive - * path, then we know that nlink == 0, and that the - * xaction that made nlink == 0 is permanently committed - * since xfs_remove runs as a synchronous transaction. - */ - error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, - (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 1 : 0)); - - if (error) { - xfs_trans_cancel(tp, - XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - return VN_INACTIVE_CACHE; - } - } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { - - /* - * If we get an error while cleaning up a - * symlink we bail out. - */ - error = (ip->i_d.di_size > XFS_IFORK_DSIZE(ip)) ? - xfs_inactive_symlink_rmt(ip, &tp) : - xfs_inactive_symlink_local(ip, &tp); - - if (error) { - ASSERT(tp == NULL); - return VN_INACTIVE_CACHE; - } - - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - } else { - error = xfs_trans_reserve(tp, 0, - XFS_IFREE_LOG_RES(mp), - 0, XFS_TRANS_PERM_LOG_RES, - XFS_INACTIVE_LOG_COUNT); - if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - return VN_INACTIVE_CACHE; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - } - - /* - * If there are attributes associated with the file - * then blow them away now. The code calls a routine - * that recursively deconstructs the attribute fork. - * We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). - */ - if (ip->i_d.di_anextents > 0) { - error = xfs_inactive_attrs(ip, &tp); - /* - * If we got an error, the transaction is already - * cancelled, and the inode is unlocked. Just get out. - */ - if (error) - return VN_INACTIVE_CACHE; - } else if (ip->i_afp) { - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - } - - /* - * Free the inode. - */ - XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_ifree(tp, ip, &free_list); - if (error) { - /* - * If we fail to free the inode, shut down. The cancel - * might do that, we need to make sure. Otherwise the - * inode might be lost for a long time or forever. - */ - if (!XFS_FORCED_SHUTDOWN(mp)) { - cmn_err(CE_NOTE, - "xfs_inactive: xfs_ifree() returned an error = %d on %s", - error, mp->m_fsname); - xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); - } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } else { - /* - * Credit the quota account(s). The inode is gone. - */ - XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); - - /* - * Just ignore errors at this point. There is - * nothing we can do except to try to keep going. - */ - (void) xfs_bmap_finish(&tp, &free_list, first_block, - &committed); - (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - } - /* - * Release the dquots held by inode, if any. - */ - XFS_QM_DQDETACH(mp, ip); - - xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); - - out: - return VN_INACTIVE_CACHE; -} - - -/* - * xfs_lookup - */ -STATIC int -xfs_lookup( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vnode_t **vpp, - int flags, - bhv_vnode_t *rdir, - cred_t *credp) -{ - xfs_inode_t *dp, *ip; - xfs_ino_t e_inum; - int error; - uint lock_mode; - bhv_vnode_t *dir_vp; - - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - dp = XFS_BHVTOI(dir_bdp); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); - - lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup_int(dir_bdp, lock_mode, dentry, &e_inum, &ip); - if (!error) { - *vpp = XFS_ITOV(ip); - ITRACE(ip); - } - xfs_iunlock_map_shared(dp, lock_mode); - return error; -} - - -/* - * xfs_create (create a new file). - */ -STATIC int -xfs_create( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vattr_t *vap, - bhv_vnode_t **vpp, - cred_t *credp) -{ - char *name = VNAME(dentry); - bhv_vnode_t *dir_vp; - xfs_inode_t *dp, *ip; - bhv_vnode_t *vp = NULL; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_dev_t rdev; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - boolean_t dp_joined_to_trans; - int dm_event_sent = 0; - uint cancel_flags; - int committed; - xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - int dm_di_mode; - int namelen; - - ASSERT(!*vpp); - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; - - dm_di_mode = vap->va_mode; - namelen = VNAMELEN(dentry); - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dir_vp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, name, NULL, - dm_di_mode, 0, 0); - - if (error) - return error; - dm_event_sent = 1; - } - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* Return through std_return after this point. */ - - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; - else if (vap->va_mask & XFS_AT_PROJID) - prid = (xfs_prid_t)vap->va_projid; - else - prid = (xfs_prid_t)dfltprid; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, - XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - ip = NULL; - dp_joined_to_trans = B_FALSE; - - tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_CREATE_SPACE_RES(mp, namelen); - /* - * Initially assume that the file does not exist and - * reserve the resources for that case. If that is not - * the case we'll drop the one we have and get a more - * appropriate transaction later. - */ - error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_CREATE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - dp = NULL; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL); - - XFS_BMAP_INIT(&free_list, &first_block); - - ASSERT(ip == NULL); - - /* - * Reserve disk quota and the inode. - */ - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen))) - goto error_return; - rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0; - error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1, - rdev, credp, prid, resblks > 0, - &ip, &committed); - if (error) { - if (error == ENOSPC) - goto error_return; - goto abort_return; - } - ITRACE(ip); - - /* - * At this point, we've gotten a newly allocated inode. - * It is locked (and joined to the transaction). - */ - - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); - - /* - * Now we join the directory inode to the transaction. - * We do not do it earlier because xfs_dir_ialloc - * might commit the previous transaction (and release - * all the locks). - */ - - VN_HOLD(dir_vp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - dp_joined_to_trans = B_TRUE; - - error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); - if (error) { - ASSERT(error != ENOSPC); - goto abort_return; - } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* - * If this is a synchronous mount, make sure that the - * create transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - dp->i_gen++; - - /* - * Attach the dquot(s) to the inodes and modify them incore. - * These ids of the inode couldn't have changed since the new - * inode has been locked ever since it was created. - */ - XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp); - - /* - * xfs_trans_commit normally decrements the vnode ref count - * when it unlocks the inode. Since we want to return the - * vnode to the caller, we bump the vnode ref count now. - */ - IHOLD(ip); - vp = XFS_ITOV(ip); - - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - goto abort_rele; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { - IRELE(ip); - tp = NULL; - goto error_return; - } - - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - /* - * Propagate the fact that the vnode changed after the - * xfs_inode locks have been released. - */ - bhv_vop_vnode_change(vp, VCHANGE_FLAGS_TRUNCATED, 3); - - *vpp = vp; - - /* Fallthrough to std_return with error = 0 */ - -std_return: - if ( (*vpp || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), - DM_EVENT_POSTCREATE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dir_vp, DM_RIGHT_NULL, - *vpp ? vp:NULL, - DM_RIGHT_NULL, name, NULL, - dm_di_mode, error, 0); - } - return error; - - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - if (tp != NULL) - xfs_trans_cancel(tp, cancel_flags); - - if (!dp_joined_to_trans && (dp != NULL)) - xfs_iunlock(dp, XFS_ILOCK_EXCL); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - goto std_return; - - abort_rele: - /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. - */ - cancel_flags |= XFS_TRANS_ABORT; - xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); - - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - goto std_return; -} - -#ifdef DEBUG -/* - * Some counters to see if (and how often) we are hitting some deadlock - * prevention code paths. - */ - -int xfs_rm_locks; -int xfs_rm_lock_delays; -int xfs_rm_attempts; -#endif - -/* - * The following routine will lock the inodes associated with the - * directory and the named entry in the directory. The locks are - * acquired in increasing inode number. - * - * If the entry is "..", then only the directory is locked. The - * vnode ref count will still include that from the .. entry in - * this case. - * - * There is a deadlock we need to worry about. If the locked directory is - * in the AIL, it might be blocking up the log. The next inode we lock - * could be already locked by another thread waiting for log space (e.g - * a permanent log reservation with a long running transaction (see - * xfs_itruncate_finish)). To solve this, we must check if the directory - * is in the ail and use lock_nowait. If we can't lock, we need to - * drop the inode lock on the directory and try again. xfs_iunlock will - * potentially push the tail if we were holding up the log. - */ -STATIC int -xfs_lock_dir_and_entry( - xfs_inode_t *dp, - bhv_vname_t *dentry, - xfs_inode_t *ip) /* inode of entry 'name' */ -{ - int attempts; - xfs_ino_t e_inum; - xfs_inode_t *ips[2]; - xfs_log_item_t *lp; - -#ifdef DEBUG - xfs_rm_locks++; -#endif - attempts = 0; - -again: - xfs_ilock(dp, XFS_ILOCK_EXCL); - - e_inum = ip->i_ino; - - ITRACE(ip); - - /* - * We want to lock in increasing inum. Since we've already - * acquired the lock on the directory, we may need to release - * if if the inum of the entry turns out to be less. - */ - if (e_inum > dp->i_ino) { - /* - * We are already in the right order, so just - * lock on the inode of the entry. - * We need to use nowait if dp is in the AIL. - */ - - lp = (xfs_log_item_t *)dp->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - attempts++; -#ifdef DEBUG - xfs_rm_attempts++; -#endif - - /* - * Unlock dp and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_rm_lock_delays++; -#endif - } - goto again; - } - } else { - xfs_ilock(ip, XFS_ILOCK_EXCL); - } - } else if (e_inum < dp->i_ino) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - - ips[0] = ip; - ips[1] = dp; - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); - } - /* else e_inum == dp->i_ino */ - /* This can happen if we're asked to lock /x/.. - * the entry is "..", which is also the parent directory. - */ - - return 0; -} - -#ifdef DEBUG -int xfs_locked_n; -int xfs_small_retries; -int xfs_middle_retries; -int xfs_lots_retries; -int xfs_lock_delays; -#endif - -/* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. - * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. - */ -void -xfs_lock_inodes( - xfs_inode_t **ips, - int inodes, - int first_locked, - uint lock_mode) -{ - int attempts = 0, i, j, try_lock; - xfs_log_item_t *lp; - - ASSERT(ips && (inodes >= 2)); /* we need at least two */ - - if (first_locked) { - try_lock = 1; - i = 1; - } else { - try_lock = 0; - i = 0; - } - -again: - for (; i < inodes; i++) { - ASSERT(ips[i]); - - if (i && (ips[i] == ips[i-1])) /* Already locked */ - continue; - - /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. - */ - - if (!try_lock) { - for (j = (i - 1); j >= 0 && !try_lock; j--) { - lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { - try_lock++; - } - } - } - - /* - * If any of the previous locks we have locked is in the AIL, - * we must TRY to get the second and subsequent locks. If - * we can't get any, we must release all we have - * and try again. - */ - - if (try_lock) { - /* try_lock must be 0 if i is 0. */ - /* - * try_lock means we have an inode locked - * that is in the AIL. - */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], lock_mode)) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } - - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ -#ifdef DEBUG - xfs_lock_delays++; -#endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { - xfs_ilock(ips[i], lock_mode); - } - } - -#ifdef DEBUG - if (attempts) { - if (attempts < 5) xfs_small_retries++; - else if (attempts < 100) xfs_middle_retries++; - else xfs_lots_retries++; - } else { - xfs_locked_n++; - } -#endif -} - -#ifdef DEBUG -#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);} -int remove_which_error_return = 0; -#else /* ! DEBUG */ -#define REMOVE_DEBUG_TRACE(x) -#endif /* ! DEBUG */ - - -/* - * xfs_remove - * - */ -STATIC int -xfs_remove( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - cred_t *credp) -{ - bhv_vnode_t *dir_vp; - char *name = VNAME(dentry); - xfs_inode_t *dp, *ip; - xfs_trans_t *tp = NULL; - xfs_mount_t *mp; - int error = 0; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - int dm_di_mode = 0; - int link_zero; - uint resblks; - int namelen; - - dir_vp = BHV_TO_VNODE(dir_bdp); - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - namelen = VNAMELEN(dentry); - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, - DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - name, NULL, 0, 0, 0); - if (error) - return error; - } - - /* From this point on, return through std_return */ - ip = NULL; - - /* - * We need to get a reference to ip before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - error = xfs_get_dir_entry(dentry, &ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - - dm_di_mode = ip->i_d.di_mode; - - vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); - - ITRACE(ip); - - error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != ip) - error = XFS_QM_DQATTACH(mp, ip, 0); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - IRELE(ip); - goto std_return; - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. - */ - resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT); - } - if (error) { - ASSERT(error != ENOSPC); - REMOVE_DEBUG_TRACE(__LINE__); - xfs_trans_cancel(tp, 0); - IRELE(ip); - return error; - } - - error = xfs_lock_dir_and_entry(dp, dentry, ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); - goto std_return; - } - - /* - * At this point, we've gotten both the directory and the entry - * inodes locked. - */ - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != ip) { - /* - * Increment vnode ref count only in this case since - * there's an extra vnode reference in the case where - * dp == ip. - */ - IHOLD(dp); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - } - - /* - * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. - */ - XFS_BMAP_INIT(&free_list, &first_block); - error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino, - &first_block, &free_list, 0); - if (error) { - ASSERT(error != ENOENT); - REMOVE_DEBUG_TRACE(__LINE__); - goto error1; - } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - dp->i_gen++; - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - error = xfs_droplink(tp, ip); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto error1; - } - - /* Determine if this is the last link while - * we are in the transaction. - */ - link_zero = (ip)->i_d.di_nlink==0; - - /* - * Take an extra ref on the inode so that it doesn't - * go to xfs_inactive() from within the commit. - */ - IHOLD(ip); - - /* - * If this is a synchronous mount, make sure that the - * remove transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto error_rele; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { - IRELE(ip); - goto std_return; - } - - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - - vn_trace_exit(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); - - /* - * Let interposed file systems know about removed links. - */ - bhv_vop_link_removed(XFS_ITOV(ip), dir_vp, link_zero); - - IRELE(ip); - -/* Fall through to std_return with error = 0 */ - std_return: - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, - DM_EVENT_POSTREMOVE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, error, 0); - } - return error; - - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - - error_rele: - /* - * In this case make sure to not release the inode until after - * the current transaction is aborted. Releasing it beforehand - * can cause us to go to xfs_inactive and start a recursive - * transaction which can easily deadlock with the current one. - */ - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - xfs_trans_cancel(tp, cancel_flags); - - /* - * Before we drop our extra reference to the inode, purge it - * from the refcache if it is there. By waiting until afterwards - * to do the IRELE, we ensure that we won't go inactive in the - * xfs_refcache_purge_ip routine (although that would be OK). - */ - xfs_refcache_purge_ip(ip); - - IRELE(ip); - - goto std_return; -} - - -/* - * xfs_link - * - */ -STATIC int -xfs_link( - bhv_desc_t *target_dir_bdp, - bhv_vnode_t *src_vp, - bhv_vname_t *dentry, - cred_t *credp) -{ - xfs_inode_t *tdp, *sip; - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_inode_t *ips[2]; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - bhv_vnode_t *target_dir_vp; - int resblks; - char *target_name = VNAME(dentry); - int target_namelen; - - target_dir_vp = BHV_TO_VNODE(target_dir_bdp); - vn_trace_entry(target_dir_vp, __FUNCTION__, (inst_t *)__return_address); - vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address); - - target_namelen = VNAMELEN(dentry); - ASSERT(!VN_ISDIR(src_vp)); - - sip = xfs_vtoi(src_vp); - tdp = XFS_BHVTOI(target_dir_bdp); - mp = tdp->i_mount; - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (DM_EVENT_ENABLED(src_vp->v_vfsp, tdp, DM_EVENT_LINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, - target_dir_vp, DM_RIGHT_NULL, - src_vp, DM_RIGHT_NULL, - target_name, NULL, 0, 0, 0); - if (error) - return error; - } - - /* Return through std_return after this point. */ - - error = XFS_QM_DQATTACH(mp, sip, 0); - if (!error && sip != tdp) - error = XFS_QM_DQATTACH(mp, tdp, 0); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_LINK_SPACE_RES(mp, target_namelen); - error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_LINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - goto error_return; - } - - if (sip->i_ino < tdp->i_ino) { - ips[0] = sip; - ips[1] = tdp; - } else { - ips[0] = tdp; - ips[1] = sip; - } - - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); - - /* - * Increment vnode ref counts since xfs_trans_commit & - * xfs_trans_cancel will both unlock the inodes and - * decrement the associated ref counts. - */ - VN_HOLD(src_vp); - VN_HOLD(target_dir_vp); - xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); - - /* - * If the source has too many links, we can't make any more to it. - */ - if (sip->i_d.di_nlink >= XFS_MAXLINK) { - error = XFS_ERROR(EMLINK); - goto error_return; - } - - /* - * If we are using project inheritance, we only allow hard link - * creation in our tree when the project IDs are the same; else - * the tree quota mechanism could be circumvented. - */ - if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (tdp->i_d.di_projid != sip->i_d.di_projid))) { - error = XFS_ERROR(EXDEV); - goto error_return; - } - - if (resblks == 0 && - (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen))) - goto error_return; - - XFS_BMAP_INIT(&free_list, &first_block); - - error = xfs_dir_createname(tp, tdp, target_name, target_namelen, - sip->i_ino, &first_block, &free_list, - resblks); - if (error) - goto abort_return; - xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - tdp->i_gen++; - xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); - - error = xfs_bumplink(tp, sip); - if (error) - goto abort_return; - - /* - * If this is a synchronous mount, make sure that the - * link transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - goto abort_return; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) - goto std_return; - - /* Fall through to std_return with error = 0. */ -std_return: - if (DM_EVENT_ENABLED(src_vp->v_vfsp, sip, - DM_EVENT_POSTLINK)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, - target_dir_vp, DM_RIGHT_NULL, - src_vp, DM_RIGHT_NULL, - target_name, NULL, 0, error, 0); - } - return error; - - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - xfs_trans_cancel(tp, cancel_flags); - goto std_return; -} - - -/* - * xfs_mkdir - * - */ -STATIC int -xfs_mkdir( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vattr_t *vap, - bhv_vnode_t **vpp, - cred_t *credp) -{ - char *dir_name = VNAME(dentry); - xfs_inode_t *dp; - xfs_inode_t *cdp; /* inode of created dir */ - bhv_vnode_t *cvp; /* vnode of created dir */ - xfs_trans_t *tp; - xfs_mount_t *mp; - int cancel_flags; - int error; - int committed; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - bhv_vnode_t *dir_vp; - boolean_t dp_joined_to_trans; - boolean_t created = B_FALSE; - int dm_event_sent = 0; - xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - int dm_di_mode; - int dir_namelen; - - dir_vp = BHV_TO_VNODE(dir_bdp); - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - dir_namelen = VNAMELEN(dentry); - - tp = NULL; - dp_joined_to_trans = B_FALSE; - dm_di_mode = vap->va_mode; - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, - dir_vp, DM_RIGHT_NULL, NULL, - DM_RIGHT_NULL, dir_name, NULL, - dm_di_mode, 0, 0); - if (error) - return error; - dm_event_sent = 1; - } - - /* Return through std_return after this point. */ - - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - mp = dp->i_mount; - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; - else if (vap->va_mask & XFS_AT_PROJID) - prid = (xfs_prid_t)vap->va_projid; - else - prid = (xfs_prid_t)dfltprid; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); - error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_MKDIR_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, - XFS_MKDIR_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - dp = NULL; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL); - - /* - * Check for directory link count overflow. - */ - if (dp->i_d.di_nlink >= XFS_MAXLINK) { - error = XFS_ERROR(EMLINK); - goto error_return; - } - - /* - * Reserve disk quota and the inode. - */ - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - if (resblks == 0 && - (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen))) - goto error_return; - /* - * create the directory inode. - */ - error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2, - 0, credp, prid, resblks > 0, - &cdp, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto abort_return; - } - ITRACE(cdp); - - /* - * Now we add the directory inode to the transaction. - * We waited until now since xfs_dir_ialloc might start - * a new transaction. Had we joined the transaction - * earlier, the locks might have gotten released. - */ - VN_HOLD(dir_vp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - dp_joined_to_trans = B_TRUE; - - XFS_BMAP_INIT(&free_list, &first_block); - - error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino, - &first_block, &free_list, resblks ? - resblks - XFS_IALLOC_SPACE_RES(mp) : 0); - if (error) { - ASSERT(error != ENOSPC); - goto error1; - } - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Bump the in memory version number of the parent directory - * so that other processes accessing it will recognize that - * the directory has changed. - */ - dp->i_gen++; - - error = xfs_dir_init(tp, cdp, dp); - if (error) - goto error2; - - cdp->i_gen = 1; - error = xfs_bumplink(tp, dp); - if (error) - goto error2; - - cvp = XFS_ITOV(cdp); - - created = B_TRUE; - - *vpp = cvp; - IHOLD(cdp); - - /* - * Attach the dquots to the new inode and modify the icount incore. - */ - XFS_QM_DQVOPCREATE(mp, tp, cdp, udqp, gdqp); - - /* - * If this is a synchronous mount, make sure that the - * mkdir transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); - if (error) { - IRELE(cdp); - goto error2; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - if (error) { - IRELE(cdp); - } - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit. */ - -std_return: - if ( (created || (error != 0 && dm_event_sent != 0)) && - DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), - DM_EVENT_POSTCREATE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, - dir_vp, DM_RIGHT_NULL, - created ? XFS_ITOV(cdp):NULL, - DM_RIGHT_NULL, - dir_name, NULL, - dm_di_mode, error, 0); - } - return error; - - error2: - error1: - xfs_bmap_cancel(&free_list); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - if (!dp_joined_to_trans && (dp != NULL)) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - } - - goto std_return; -} - - -/* - * xfs_rmdir - * - */ -STATIC int -xfs_rmdir( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - cred_t *credp) -{ - char *name = VNAME(dentry); - xfs_inode_t *dp; - xfs_inode_t *cdp; /* child directory */ - xfs_trans_t *tp; - xfs_mount_t *mp; - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - bhv_vnode_t *dir_vp; - int dm_di_mode = 0; - int last_cdp_link; - int namelen; - uint resblks; - - dir_vp = BHV_TO_VNODE(dir_bdp); - dp = XFS_BHVTOI(dir_bdp); - mp = dp->i_mount; - - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - if (XFS_FORCED_SHUTDOWN(XFS_BHVTOI(dir_bdp)->i_mount)) - return XFS_ERROR(EIO); - namelen = VNAMELEN(dentry); - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_REMOVE)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, 0, 0, 0); - if (error) - return XFS_ERROR(error); - } - - /* Return through std_return after this point. */ - - cdp = NULL; - - /* - * We need to get a reference to cdp before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - error = xfs_get_dir_entry(dentry, &cdp); - if (error) { - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - mp = dp->i_mount; - dm_di_mode = cdp->i_d.di_mode; - - /* - * Get the dquots for the inodes. - */ - error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != cdp) - error = XFS_QM_DQATTACH(mp, cdp, 0); - if (error) { - IRELE(cdp); - REMOVE_DEBUG_TRACE(__LINE__); - goto std_return; - } - - tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * We try to get the real space reservation first, - * allowing for directory btree deletion(s) implying - * possible bmap insert(s). If we can't get the space - * reservation then we use 0 instead, and avoid the bmap - * btree insert(s) in the directory code by, if the bmap - * insert tries to happen, instead trimming the LAST - * block from the directory. - */ - resblks = XFS_REMOVE_SPACE_RES(mp); - error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); - if (error == ENOSPC) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT); - } - if (error) { - ASSERT(error != ENOSPC); - cancel_flags = 0; - IRELE(cdp); - goto error_return; - } - XFS_BMAP_INIT(&free_list, &first_block); - - /* - * Now lock the child directory inode and the parent directory - * inode in the proper order. This will take care of validating - * that the directory entry for the child directory inode has - * not changed while we were obtaining a log reservation. - */ - error = xfs_lock_dir_and_entry(dp, dentry, cdp); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - IRELE(cdp); - goto std_return; - } - - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != cdp) { - /* - * Only increment the parent directory vnode count if - * we didn't bump it in looking up cdp. The only time - * we don't bump it is when we're looking up ".". - */ - VN_HOLD(dir_vp); - } - - ITRACE(cdp); - xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); - - ASSERT(cdp->i_d.di_nlink >= 2); - if (cdp->i_d.di_nlink != 2) { - error = XFS_ERROR(ENOTEMPTY); - goto error_return; - } - if (!xfs_dir_isempty(cdp)) { - error = XFS_ERROR(ENOTEMPTY); - goto error_return; - } - - error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error1; - - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - - /* - * Bump the in memory generation count on the parent - * directory so that other can know that it has changed. - */ - dp->i_gen++; - - /* - * Drop the link from cdp's "..". - */ - error = xfs_droplink(tp, dp); - if (error) { - goto error1; - } - - /* - * Drop the link from dp to cdp. - */ - error = xfs_droplink(tp, cdp); - if (error) { - goto error1; - } - - /* - * Drop the "." link from cdp to self. - */ - error = xfs_droplink(tp, cdp); - if (error) { - goto error1; - } - - /* Determine these before committing transaction */ - last_cdp_link = (cdp)->i_d.di_nlink==0; - - /* - * Take an extra ref on the child vnode so that it - * does not go to xfs_inactive() from within the commit. - */ - IHOLD(cdp); - - /* - * If this is a synchronous mount, make sure that the - * rmdir transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - error = xfs_bmap_finish (&tp, &free_list, first_block, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - IRELE(cdp); - goto std_return; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { - IRELE(cdp); - goto std_return; - } - - - /* - * Let interposed file systems know about removed links. - */ - bhv_vop_link_removed(XFS_ITOV(cdp), dir_vp, last_cdp_link); - - IRELE(cdp); - - /* Fall through to std_return with error = 0 or the errno - * from xfs_trans_commit. */ - std_return: - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_POSTREMOVE)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, - dir_vp, DM_RIGHT_NULL, - NULL, DM_RIGHT_NULL, - name, NULL, dm_di_mode, - error, 0); - } - return error; - - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - /* FALLTHROUGH */ - - error_return: - xfs_trans_cancel(tp, cancel_flags); - goto std_return; -} - - -/* - * Read dp's entries starting at uiop->uio_offset and translate them into - * bufsize bytes worth of struct dirents starting at bufbase. - */ -STATIC int -xfs_readdir( - bhv_desc_t *dir_bdp, - uio_t *uiop, - cred_t *credp, - int *eofp) -{ - xfs_inode_t *dp; - xfs_trans_t *tp = NULL; - int error = 0; - uint lock_mode; - - vn_trace_entry(BHV_TO_VNODE(dir_bdp), __FUNCTION__, - (inst_t *)__return_address); - dp = XFS_BHVTOI(dir_bdp); - - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) - return XFS_ERROR(EIO); - - lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_getdents(tp, dp, uiop, eofp); - xfs_iunlock_map_shared(dp, lock_mode); - return error; -} - - -STATIC int -xfs_symlink( - bhv_desc_t *dir_bdp, - bhv_vname_t *dentry, - bhv_vattr_t *vap, - char *target_path, - bhv_vnode_t **vpp, - cred_t *credp) -{ - xfs_trans_t *tp; - xfs_mount_t *mp; - xfs_inode_t *dp; - xfs_inode_t *ip; - int error; - int pathlen; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - boolean_t dp_joined_to_trans; - bhv_vnode_t *dir_vp; - uint cancel_flags; - int committed; - xfs_fileoff_t first_fsb; - xfs_filblks_t fs_blocks; - int nmaps; - xfs_bmbt_irec_t mval[SYMLINK_MAPS]; - xfs_daddr_t d; - char *cur_chunk; - int byte_cnt; - int n; - xfs_buf_t *bp; - xfs_prid_t prid; - struct xfs_dquot *udqp, *gdqp; - uint resblks; - char *link_name = VNAME(dentry); - int link_namelen; - - *vpp = NULL; - dir_vp = BHV_TO_VNODE(dir_bdp); - dp = XFS_BHVTOI(dir_bdp); - dp_joined_to_trans = B_FALSE; - error = 0; - ip = NULL; - tp = NULL; - - vn_trace_entry(dir_vp, __FUNCTION__, (inst_t *)__return_address); - - mp = dp->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - link_namelen = VNAMELEN(dentry); - - /* - * Check component lengths of the target path name. - */ - pathlen = strlen(target_path); - if (pathlen >= MAXPATHLEN) /* total string too long */ - return XFS_ERROR(ENAMETOOLONG); - if (pathlen >= MAXNAMELEN) { /* is any component too long? */ - int len, total; - char *path; - - for (total = 0, path = target_path; total < pathlen;) { - /* - * Skip any slashes. - */ - while(*path == '/') { - total++; - path++; - } - - /* - * Count up to the next slash or end of path. - * Error out if the component is bigger than MAXNAMELEN. - */ - for(len = 0; *path != '/' && total < pathlen;total++, path++) { - if (++len >= MAXNAMELEN) { - error = ENAMETOOLONG; - return error; - } - } - } - } - - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_SYMLINK)) { - error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, - DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, - link_name, target_path, 0, 0, 0); - if (error) - return error; - } - - /* Return through std_return after this point. */ - - udqp = gdqp = NULL; - if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - prid = dp->i_d.di_projid; - else if (vap->va_mask & XFS_AT_PROJID) - prid = (xfs_prid_t)vap->va_projid; - else - prid = (xfs_prid_t)dfltprid; - - /* - * Make sure that we have allocated dquot(s) on disk. - */ - error = XFS_QM_DQVOPALLOC(mp, dp, - current_fsuid(credp), current_fsgid(credp), prid, - XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); - if (error) - goto std_return; - - tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* - * The symlink will fit into the inode data fork? - * There can't be any attributes so we get the whole variable part. - */ - if (pathlen <= XFS_LITINO(mp)) - fs_blocks = 0; - else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); - resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); - error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - if (error == ENOSPC && fs_blocks == 0) { - resblks = 0; - error = xfs_trans_reserve(tp, 0, XFS_SYMLINK_LOG_RES(mp), 0, - XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); - } - if (error) { - cancel_flags = 0; - dp = NULL; - goto error_return; - } - - xfs_ilock(dp, XFS_ILOCK_EXCL); - - /* - * Check whether the directory allows new symlinks or not. - */ - if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { - error = XFS_ERROR(EPERM); - goto error_return; - } - - /* - * Reserve disk quota : blocks and inode. - */ - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, udqp, gdqp, resblks, 1, 0); - if (error) - goto error_return; - - /* - * Check for ability to enter directory entry, if no space reserved. - */ - if (resblks == 0 && - (error = xfs_dir_canenter(tp, dp, link_name, link_namelen))) - goto error_return; - /* - * Initialize the bmap freelist prior to calling either - * bmapi or the directory create code. - */ - XFS_BMAP_INIT(&free_list, &first_block); - - /* - * Allocate an inode for the symlink. - */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT), - 1, 0, credp, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == ENOSPC) - goto error_return; - goto error1; - } - ITRACE(ip); - - VN_HOLD(dir_vp); - xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - dp_joined_to_trans = B_TRUE; - - /* - * Also attach the dquot(s) to it, if applicable. - */ - XFS_QM_DQVOPCREATE(mp, tp, ip, udqp, gdqp); - - if (resblks) - resblks -= XFS_IALLOC_SPACE_RES(mp); - /* - * If the symlink will fit into the inode, write it inline. - */ - if (pathlen <= XFS_IFORK_DSIZE(ip)) { - xfs_idata_realloc(ip, pathlen, XFS_DATA_FORK); - memcpy(ip->i_df.if_u1.if_data, target_path, pathlen); - ip->i_d.di_size = pathlen; - - /* - * The inode was initially created in extent format. - */ - ip->i_df.if_flags &= ~(XFS_IFEXTENTS | XFS_IFBROOT); - ip->i_df.if_flags |= XFS_IFINLINE; - - ip->i_d.di_format = XFS_DINODE_FMT_LOCAL; - xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE); - - } else { - first_fsb = 0; - nmaps = SYMLINK_MAPS; - - error = xfs_bmapi(tp, ip, first_fsb, fs_blocks, - XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, - &first_block, resblks, mval, &nmaps, - &free_list, NULL); - if (error) { - goto error1; - } - - if (resblks) - resblks -= fs_blocks; - ip->i_d.di_size = pathlen; - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - - cur_chunk = target_path; - for (n = 0; n < nmaps; n++) { - d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); - byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, - BTOBB(byte_cnt), 0); - ASSERT(bp && !XFS_BUF_GETERROR(bp)); - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } - pathlen -= byte_cnt; - - memcpy(XFS_BUF_PTR(bp), cur_chunk, byte_cnt); - cur_chunk += byte_cnt; - - xfs_trans_log_buf(tp, bp, 0, byte_cnt - 1); - } - } - - /* - * Create the directory entry for the symlink. - */ - error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino, - &first_block, &free_list, resblks); - if (error) - goto error1; - xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); - - /* - * Bump the in memory version number of the parent directory - * so that other processes accessing it will recognize that - * the directory has changed. - */ - dp->i_gen++; - - /* - * If this is a synchronous mount, make sure that the - * symlink transaction goes to disk before returning to - * the user. - */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } - - /* - * xfs_trans_commit normally decrements the vnode ref count - * when it unlocks the inode. Since we want to return the - * vnode to the caller, we bump the vnode ref count now. - */ - IHOLD(ip); - - error = xfs_bmap_finish(&tp, &free_list, first_block, &committed); - if (error) { - goto error2; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - /* Fall through to std_return with error = 0 or errno from - * xfs_trans_commit */ -std_return: - if (DM_EVENT_ENABLED(dir_vp->v_vfsp, XFS_BHVTOI(dir_bdp), - DM_EVENT_POSTSYMLINK)) { - (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, - dir_vp, DM_RIGHT_NULL, - error ? NULL : XFS_ITOV(ip), - DM_RIGHT_NULL, link_name, target_path, - 0, error, 0); - } - - if (!error) { - bhv_vnode_t *vp; - - ASSERT(ip); - vp = XFS_ITOV(ip); - *vpp = vp; - } - return error; - - error2: - IRELE(ip); - error1: - xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); - XFS_QM_DQRELE(mp, udqp); - XFS_QM_DQRELE(mp, gdqp); - - if (!dp_joined_to_trans && (dp != NULL)) { - xfs_iunlock(dp, XFS_ILOCK_EXCL); - } - - goto std_return; -} - - -/* - * xfs_fid2 - * - * A fid routine that takes a pointer to a previously allocated - * fid structure (like xfs_fast_fid) but uses a 64 bit inode number. - */ -STATIC int -xfs_fid2( - bhv_desc_t *bdp, - fid_t *fidp) -{ - xfs_inode_t *ip; - xfs_fid2_t *xfid; - - vn_trace_entry(BHV_TO_VNODE(bdp), __FUNCTION__, - (inst_t *)__return_address); - ASSERT(sizeof(fid_t) >= sizeof(xfs_fid2_t)); - - xfid = (xfs_fid2_t *)fidp; - ip = XFS_BHVTOI(bdp); - xfid->fid_len = sizeof(xfs_fid2_t) - sizeof(xfid->fid_len); - xfid->fid_pad = 0; - /* - * use memcpy because the inode is a long long and there's no - * assurance that xfid->fid_ino is properly aligned. - */ - memcpy(&xfid->fid_ino, &ip->i_ino, sizeof(xfid->fid_ino)); - xfid->fid_gen = ip->i_d.di_gen; - - return 0; -} - - -/* - * xfs_rwlock - */ -int -xfs_rwlock( - bhv_desc_t *bdp, - bhv_vrwlock_t locktype) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - if (VN_ISDIR(vp)) - return 1; - ip = XFS_BHVTOI(bdp); - if (locktype == VRWLOCK_WRITE) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - } else if (locktype == VRWLOCK_TRY_READ) { - return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED); - } else if (locktype == VRWLOCK_TRY_WRITE) { - return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL); - } else { - ASSERT((locktype == VRWLOCK_READ) || - (locktype == VRWLOCK_WRITE_DIRECT)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); - } - - return 1; -} - - -/* - * xfs_rwunlock - */ -void -xfs_rwunlock( - bhv_desc_t *bdp, - bhv_vrwlock_t locktype) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - if (VN_ISDIR(vp)) - return; - ip = XFS_BHVTOI(bdp); - if (locktype == VRWLOCK_WRITE) { - /* - * In the write case, we may have added a new entry to - * the reference cache. This might store a pointer to - * an inode to be released in this inode. If it is there, - * clear the pointer and release the inode after unlocking - * this one. - */ - xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL); - } else { - ASSERT((locktype == VRWLOCK_READ) || - (locktype == VRWLOCK_WRITE_DIRECT)); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); - } - return; -} - -STATIC int -xfs_inode_flush( - bhv_desc_t *bdp, - int flags) -{ - xfs_inode_t *ip; - xfs_mount_t *mp; - xfs_inode_log_item_t *iip; - int error = 0; - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - iip = ip->i_itemp; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - /* - * Bypass inodes which have already been cleaned by - * the inode flush clustering code inside xfs_iflush - */ - if ((ip->i_update_core == 0) && - ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) - return 0; - - if (flags & FLUSH_LOG) { - if (iip && iip->ili_last_lsn) { - xlog_t *log = mp->m_log; - xfs_lsn_t sync_lsn; - int s, log_flags = XFS_LOG_FORCE; - - s = GRANT_LOCK(log); - sync_lsn = log->l_last_sync_lsn; - GRANT_UNLOCK(log, s); - - if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) <= 0)) - return 0; - - if (flags & FLUSH_SYNC) - log_flags |= XFS_LOG_SYNC; - return xfs_log_force(mp, iip->ili_last_lsn, log_flags); - } - } - - /* - * We make this non-blocking if the inode is contended, - * return EAGAIN to indicate to the caller that they - * did not succeed. This prevents the flush path from - * blocking on inodes inside another operation right - * now, they get caught later by xfs_sync. - */ - if (flags & FLUSH_INODE) { - int flush_flags; - - if (xfs_ipincount(ip)) - return EAGAIN; - - if (flags & FLUSH_SYNC) { - xfs_ilock(ip, XFS_ILOCK_SHARED); - xfs_iflock(ip); - } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { - if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return EAGAIN; - } - } else { - return EAGAIN; - } - - if (flags & FLUSH_SYNC) - flush_flags = XFS_IFLUSH_SYNC; - else - flush_flags = XFS_IFLUSH_ASYNC; - - error = xfs_iflush(ip, flush_flags); - xfs_iunlock(ip, XFS_ILOCK_SHARED); - } - - return error; -} - -int -xfs_set_dmattrs ( - bhv_desc_t *bdp, - u_int evmask, - u_int16_t state, - cred_t *credp) -{ - xfs_inode_t *ip; - xfs_trans_t *tp; - xfs_mount_t *mp; - int error; - - if (!capable(CAP_SYS_ADMIN)) - return XFS_ERROR(EPERM); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES (mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - - ip->i_iocore.io_dmevmask = ip->i_d.di_dmevmask = evmask; - ip->i_iocore.io_dmstate = ip->i_d.di_dmstate = state; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - IHOLD(ip); - error = xfs_trans_commit(tp, 0, NULL); - - return error; -} - -STATIC int -xfs_reclaim( - bhv_desc_t *bdp) -{ - xfs_inode_t *ip; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - ip = XFS_BHVTOI(bdp); - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ASSERT(!VN_MAPPED(vp)); - - /* bad inode, get out here ASAP */ - if (VN_BAD(vp)) { - xfs_ireclaim(ip); - return 0; - } - - vn_iowait(vp); - - ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); - - /* - * Make sure the atime in the XFS inode is correct before freeing the - * Linux inode. - */ - xfs_synchronize_atime(ip); - - /* If we have nothing to flush with this inode then complete the - * teardown now, otherwise break the link between the xfs inode - * and the linux inode and clean up the xfs inode later. This - * avoids flushing the inode to disk during the delete operation - * itself. - */ - if (!ip->i_update_core && (ip->i_itemp == NULL)) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC); - } else { - xfs_mount_t *mp = ip->i_mount; - - /* Protect sync from us */ - XFS_MOUNT_ILOCK(mp); - vn_bhv_remove(VN_BHV_HEAD(vp), XFS_ITOBHV(ip)); - list_add_tail(&ip->i_reclaim, &mp->m_del_inodes); - ip->i_flags |= XFS_IRECLAIMABLE; - XFS_MOUNT_IUNLOCK(mp); - } - return 0; -} - -int -xfs_finish_reclaim( - xfs_inode_t *ip, - int locked, - int sync_mode) -{ - xfs_ihash_t *ih = ip->i_hash; - bhv_vnode_t *vp = XFS_ITOV_NULL(ip); - int error; - - if (vp && VN_BAD(vp)) - goto reclaim; - - /* The hash lock here protects a thread in xfs_iget_core from - * racing with us on linking the inode back with a vnode. - * Once we have the XFS_IRECLAIM flag set it will not touch - * us. - */ - write_lock(&ih->ih_lock); - if ((ip->i_flags & XFS_IRECLAIM) || - (!(ip->i_flags & XFS_IRECLAIMABLE) && vp == NULL)) { - write_unlock(&ih->ih_lock); - if (locked) { - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - return 1; - } - ip->i_flags |= XFS_IRECLAIM; - write_unlock(&ih->ih_lock); - - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - if (!locked) { - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - } - - if (ip->i_update_core || - ((ip->i_itemp != NULL) && - (ip->i_itemp->ili_format.ilf_fields != 0))) { - error = xfs_iflush(ip, sync_mode); - /* - * If we hit an error, typically because of filesystem - * shutdown, we don't need to let vn_reclaim to know - * because we're gonna reclaim the inode anyway. - */ - if (error) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto reclaim; - } - xfs_iflock(ip); /* synchronize with xfs_iflush_done */ - } - - ASSERT(ip->i_update_core == 0); - ASSERT(ip->i_itemp == NULL || - ip->i_itemp->ili_format.ilf_fields == 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } else if (locked) { - /* - * We are not interested in doing an iflush if we're - * in the process of shutting down the filesystem forcibly. - * So, just reclaim the inode. - */ - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - reclaim: - xfs_ireclaim(ip); - return 0; -} - -int -xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock) -{ - int purged; - xfs_inode_t *ip, *n; - int done = 0; - - while (!done) { - purged = 0; - XFS_MOUNT_ILOCK(mp); - list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) { - if (noblock) { - if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) - continue; - if (xfs_ipincount(ip) || - !xfs_iflock_nowait(ip)) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - } - XFS_MOUNT_IUNLOCK(mp); - if (xfs_finish_reclaim(ip, noblock, - XFS_IFLUSH_DELWRI_ELSE_ASYNC)) - delay(1); - purged = 1; - break; - } - - done = !purged; - } - - XFS_MOUNT_IUNLOCK(mp); - return 0; -} - -/* - * xfs_alloc_file_space() - * This routine allocates disk space for the given file. - * - * If alloc_type == 0, this request is for an ALLOCSP type - * request which will change the file size. In this case, no - * DMAPI event will be generated by the call. A TRUNCATE event - * will be generated later by xfs_setattr. - * - * If alloc_type != 0, this request is for a RESVSP type - * request, and a DMAPI DM_EVENT_WRITE will be generated if the - * lower block boundary byte address is less than the file's - * length. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -STATIC int -xfs_alloc_file_space( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_off_t len, - int alloc_type, - int attr_flags) -{ - xfs_mount_t *mp = ip->i_mount; - xfs_off_t count; - xfs_filblks_t allocated_fsb; - xfs_filblks_t allocatesize_fsb; - xfs_extlen_t extsz, temp; - xfs_fileoff_t startoffset_fsb; - xfs_fsblock_t firstfsb; - int nimaps; - int bmapi_flag; - int quota_flag; - int rt; - xfs_trans_t *tp; - xfs_bmbt_irec_t imaps[1], *imapp; - xfs_bmap_free_t free_list; - uint qblocks, resblks, resrtextents; - int committed; - int error; - - vn_trace_entry(XFS_ITOV(ip), __FUNCTION__, (inst_t *)__return_address); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - rt = XFS_IS_REALTIME_INODE(ip); - if (unlikely(rt)) { - if (!(extsz = ip->i_d.di_extsize)) - extsz = mp->m_sb.sb_rextsize; - } else { - extsz = ip->i_d.di_extsize; - } - - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) - return error; - - if (len <= 0) - return XFS_ERROR(EINVAL); - - count = len; - error = 0; - imapp = &imaps[0]; - nimaps = 1; - bmapi_flag = XFS_BMAPI_WRITE | (alloc_type ? XFS_BMAPI_PREALLOC : 0); - startoffset_fsb = XFS_B_TO_FSBT(mp, offset); - allocatesize_fsb = XFS_B_TO_FSB(mp, count); - - /* Generate a DMAPI event if needed. */ - if (alloc_type != 0 && offset < ip->i_d.di_size && - (attr_flags&ATTR_DMI) == 0 && - DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { - xfs_off_t end_dmi_offset; - - end_dmi_offset = offset+len; - if (end_dmi_offset > ip->i_d.di_size) - end_dmi_offset = ip->i_d.di_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), - offset, end_dmi_offset - offset, - 0, NULL); - if (error) - return error; - } - - /* - * Allocate file space until done or until there is an error - */ -retry: - while (allocatesize_fsb && !error) { - xfs_fileoff_t s, e; - - /* - * Determine space reservations for data/realtime. - */ - if (unlikely(extsz)) { - s = startoffset_fsb; - do_div(s, extsz); - s *= extsz; - e = startoffset_fsb + allocatesize_fsb; - if ((temp = do_mod(startoffset_fsb, extsz))) - e += temp; - if ((temp = do_mod(e, extsz))) - e += extsz - temp; - } else { - s = 0; - e = allocatesize_fsb; - } - - if (unlikely(rt)) { - resrtextents = qblocks = (uint)(e - s); - resrtextents /= mp->m_sb.sb_rextsize; - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - quota_flag = XFS_QMOPT_RES_RTBLKS; - } else { - resrtextents = 0; - resblks = qblocks = \ - XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); - quota_flag = XFS_QMOPT_RES_REGBLKS; - } - - /* - * Allocate and setup the transaction. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, resblks, - XFS_WRITE_LOG_RES(mp), resrtextents, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - /* - * Check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, tp, ip, - qblocks, 0, quota_flag); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - /* - * Issue the xfs_bmapi() call to allocate the blocks - */ - XFS_BMAP_INIT(&free_list, &firstfsb); - error = XFS_BMAPI(mp, tp, &ip->i_iocore, startoffset_fsb, - allocatesize_fsb, bmapi_flag, - &firstfsb, 0, imapp, &nimaps, - &free_list, NULL); - if (error) { - goto error0; - } - - /* - * Complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); - if (error) { - goto error0; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) { - break; - } - - allocated_fsb = imapp->br_blockcount; - - if (nimaps == 0) { - error = XFS_ERROR(ENOSPC); - break; - } - - startoffset_fsb += allocated_fsb; - allocatesize_fsb -= allocated_fsb; - } -dmapi_enospc_check: - if (error == ENOSPC && (attr_flags&ATTR_DMI) == 0 && - DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_NOSPACE)) { - - error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, - XFS_ITOV(ip), DM_RIGHT_NULL, - XFS_ITOV(ip), DM_RIGHT_NULL, - NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ - if (error == 0) - goto retry; /* Maybe DMAPI app. has made space */ - /* else fall through with error from XFS_SEND_DATA */ - } - - return error; - -error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ - xfs_bmap_cancel(&free_list); - XFS_TRANS_UNRESERVE_QUOTA_NBLKS(mp, tp, ip, qblocks, 0, quota_flag); - -error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto dmapi_enospc_check; -} - -/* - * Zero file bytes between startoff and endoff inclusive. - * The iolock is held exclusive and no blocks are buffered. - */ -STATIC int -xfs_zero_remaining_bytes( - xfs_inode_t *ip, - xfs_off_t startoff, - xfs_off_t endoff) -{ - xfs_bmbt_irec_t imap; - xfs_fileoff_t offset_fsb; - xfs_off_t lastoffset; - xfs_off_t offset; - xfs_buf_t *bp; - xfs_mount_t *mp = ip->i_mount; - int nimap; - int error = 0; - - bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, - ip->i_d.di_flags & XFS_DIFLAG_REALTIME ? - mp->m_rtdev_targp : mp->m_ddev_targp); - - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { - offset_fsb = XFS_B_TO_FSBT(mp, offset); - nimap = 1; - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, offset_fsb, 1, 0, - NULL, 0, &imap, &nimap, NULL, NULL); - if (error || nimap < 1) - break; - ASSERT(imap.br_blockcount >= 1); - ASSERT(imap.br_startoff == offset_fsb); - lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; - if (lastoffset > endoff) - lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) - continue; - XFS_BUF_UNDONE(bp); - XFS_BUF_UNWRITE(bp); - XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); - xfsbdstrat(mp, bp); - if ((error = xfs_iowait(bp))) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", - mp, bp, XFS_BUF_ADDR(bp)); - break; - } - memset(XFS_BUF_PTR(bp) + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); - XFS_BUF_UNDONE(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_WRITE(bp); - xfsbdstrat(mp, bp); - if ((error = xfs_iowait(bp))) { - xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", - mp, bp, XFS_BUF_ADDR(bp)); - break; - } - } - xfs_buf_free(bp); - return error; -} - -/* - * xfs_free_file_space() - * This routine frees disk space for the given file. - * - * This routine is only called by xfs_change_file_space - * for an UNRESVSP type call. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -STATIC int -xfs_free_file_space( - xfs_inode_t *ip, - xfs_off_t offset, - xfs_off_t len, - int attr_flags) -{ - bhv_vnode_t *vp; - int committed; - int done; - xfs_off_t end_dmi_offset; - xfs_fileoff_t endoffset_fsb; - int error; - xfs_fsblock_t firstfsb; - xfs_bmap_free_t free_list; - xfs_off_t ilen; - xfs_bmbt_irec_t imap; - xfs_off_t ioffset; - xfs_extlen_t mod=0; - xfs_mount_t *mp; - int nimap; - uint resblks; - int rounding; - int rt; - xfs_fileoff_t startoffset_fsb; - xfs_trans_t *tp; - int need_iolock = 1; - - vp = XFS_ITOV(ip); - mp = ip->i_mount; - - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - if ((error = XFS_QM_DQATTACH(mp, ip, 0))) - return error; - - error = 0; - if (len <= 0) /* if nothing being freed */ - return error; - rt = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME); - startoffset_fsb = XFS_B_TO_FSB(mp, offset); - end_dmi_offset = offset + len; - endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset); - - if (offset < ip->i_d.di_size && - (attr_flags & ATTR_DMI) == 0 && - DM_EVENT_ENABLED(XFS_MTOVFS(mp), ip, DM_EVENT_WRITE)) { - if (end_dmi_offset > ip->i_d.di_size) - end_dmi_offset = ip->i_d.di_size; - error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, - offset, end_dmi_offset - offset, - AT_DELAY_FLAG(attr_flags), NULL); - if (error) - return error; - } - - if (attr_flags & ATTR_NOLOCK) - need_iolock = 0; - if (need_iolock) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); - vn_iowait(vp); /* wait for the completion of any pending DIOs */ - } - - rounding = MAX((__uint8_t)(1 << mp->m_sb.sb_blocklog), - (__uint8_t)NBPP); - ilen = len + (offset & (rounding - 1)); - ioffset = offset & ~(rounding - 1); - if (ilen & (rounding - 1)) - ilen = (ilen + rounding) & ~(rounding - 1); - - if (VN_CACHED(vp) != 0) { - xfs_inval_cached_trace(&ip->i_iocore, ioffset, -1, - ctooff(offtoct(ioffset)), -1); - bhv_vop_flushinval_pages(vp, ctooff(offtoct(ioffset)), - -1, FI_REMAPF_LOCKED); - } - - /* - * Need to zero the stuff we're not freeing, on disk. - * If its a realtime file & can't use unwritten extents then we - * actually need to zero the extent edges. Otherwise xfs_bunmapi - * will take care of it for us. - */ - if (rt && !XFS_SB_VERSION_HASEXTFLGBIT(&mp->m_sb)) { - nimap = 1; - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, startoffset_fsb, - 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); - if (error) - goto out_unlock_iolock; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); - if (mod) - startoffset_fsb += mp->m_sb.sb_rextsize - mod; - } - nimap = 1; - error = XFS_BMAPI(mp, NULL, &ip->i_iocore, endoffset_fsb - 1, - 1, 0, NULL, 0, &imap, &nimap, NULL, NULL); - if (error) - goto out_unlock_iolock; - ASSERT(nimap == 0 || nimap == 1); - if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - mod++; - if (mod && (mod != mp->m_sb.sb_rextsize)) - endoffset_fsb -= mod; - } - } - if ((done = (endoffset_fsb <= startoffset_fsb))) - /* - * One contiguous piece to clear - */ - error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1); - else { - /* - * Some full blocks, possibly two pieces to clear - */ - if (offset < XFS_FSB_TO_B(mp, startoffset_fsb)) - error = xfs_zero_remaining_bytes(ip, offset, - XFS_FSB_TO_B(mp, startoffset_fsb) - 1); - if (!error && - XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len) - error = xfs_zero_remaining_bytes(ip, - XFS_FSB_TO_B(mp, endoffset_fsb), - offset + len - 1); - } - - /* - * free file space until done or until there is an error - */ - resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); - while (!error && !done) { - - /* - * allocate and setup the transaction - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); - error = xfs_trans_reserve(tp, - resblks, - XFS_WRITE_LOG_RES(mp), - 0, - XFS_TRANS_PERM_LOG_RES, - XFS_WRITE_LOG_COUNT); - - /* - * check for running out of space - */ - if (error) { - /* - * Free the transaction structure. - */ - ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); - break; - } - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = XFS_TRANS_RESERVE_QUOTA(mp, tp, - ip->i_udquot, ip->i_gdquot, resblks, 0, - XFS_QMOPT_RES_REGBLKS); - if (error) - goto error1; - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - /* - * issue the bunmapi() call to free the blocks - */ - XFS_BMAP_INIT(&free_list, &firstfsb); - error = XFS_BUNMAPI(mp, tp, &ip->i_iocore, startoffset_fsb, - endoffset_fsb - startoffset_fsb, - 0, 2, &firstfsb, &free_list, NULL, &done); - if (error) { - goto error0; - } - - /* - * complete the transaction - */ - error = xfs_bmap_finish(&tp, &free_list, firstfsb, &committed); - if (error) { - goto error0; - } - - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - } - - out_unlock_iolock: - if (need_iolock) - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - return error; - - error0: - xfs_bmap_cancel(&free_list); - error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) : - XFS_ILOCK_EXCL); - return error; -} - -/* - * xfs_change_file_space() - * This routine allocates or frees disk space for the given file. - * The user specified parameters are checked for alignment and size - * limitations. - * - * RETURNS: - * 0 on success - * errno on error - * - */ -int -xfs_change_file_space( - bhv_desc_t *bdp, - int cmd, - xfs_flock64_t *bf, - xfs_off_t offset, - cred_t *credp, - int attr_flags) -{ - int clrprealloc; - int error; - xfs_fsize_t fsize; - xfs_inode_t *ip; - xfs_mount_t *mp; - int setprealloc; - xfs_off_t startoffset; - xfs_off_t llen; - xfs_trans_t *tp; - bhv_vattr_t va; - bhv_vnode_t *vp; - - vp = BHV_TO_VNODE(bdp); - vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address); - - ip = XFS_BHVTOI(bdp); - mp = ip->i_mount; - - /* - * must be a regular file and have write permission - */ - if (!VN_ISREG(vp)) - return XFS_ERROR(EINVAL); - - xfs_ilock(ip, XFS_ILOCK_SHARED); - - if ((error = xfs_iaccess(ip, S_IWUSR, credp))) { - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return error; - } - - xfs_iunlock(ip, XFS_ILOCK_SHARED); - - switch (bf->l_whence) { - case 0: /*SEEK_SET*/ - break; - case 1: /*SEEK_CUR*/ - bf->l_start += offset; - break; - case 2: /*SEEK_END*/ - bf->l_start += ip->i_d.di_size; - break; - default: - return XFS_ERROR(EINVAL); - } - - llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; - - if ( (bf->l_start < 0) - || (bf->l_start > XFS_MAXIOFFSET(mp)) - || (bf->l_start + llen < 0) - || (bf->l_start + llen > XFS_MAXIOFFSET(mp))) - return XFS_ERROR(EINVAL); - - bf->l_whence = 0; - - startoffset = bf->l_start; - fsize = ip->i_d.di_size; - - /* - * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve - * file space. - * These calls do NOT zero the data space allocated to the file, - * nor do they change the file size. - * - * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file - * space. - * These calls cause the new file data to be zeroed and the file - * size to be changed. - */ - setprealloc = clrprealloc = 0; - - switch (cmd) { - case XFS_IOC_RESVSP: - case XFS_IOC_RESVSP64: - error = xfs_alloc_file_space(ip, startoffset, bf->l_len, - 1, attr_flags); - if (error) - return error; - setprealloc = 1; - break; - - case XFS_IOC_UNRESVSP: - case XFS_IOC_UNRESVSP64: - if ((error = xfs_free_file_space(ip, startoffset, bf->l_len, - attr_flags))) - return error; - break; - - case XFS_IOC_ALLOCSP: - case XFS_IOC_ALLOCSP64: - case XFS_IOC_FREESP: - case XFS_IOC_FREESP64: - if (startoffset > fsize) { - error = xfs_alloc_file_space(ip, fsize, - startoffset - fsize, 0, attr_flags); - if (error) - break; - } - - va.va_mask = XFS_AT_SIZE; - va.va_size = startoffset; - - error = xfs_setattr(bdp, &va, attr_flags, credp); - - if (error) - return error; - - clrprealloc = 1; - break; - - default: - ASSERT(0); - return XFS_ERROR(EINVAL); - } - - /* - * update the inode timestamp, mode, and prealloc flag bits - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); - - if ((error = xfs_trans_reserve(tp, 0, XFS_WRITEID_LOG_RES(mp), - 0, 0, 0))) { - /* ASSERT(0); */ - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_ihold(tp, ip); - - if ((attr_flags & ATTR_DMI) == 0) { - ip->i_d.di_mode &= ~S_ISUID; - - /* - * Note that we don't have to worry about mandatory - * file locking being disabled here because we only - * clear the S_ISGID bit if the Group execute bit is - * on, but if it was on then mandatory locking wouldn't - * have been enabled. - */ - if (ip->i_d.di_mode & S_IXGRP) - ip->i_d.di_mode &= ~S_ISGID; - - xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - } - if (setprealloc) - ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC; - else if (clrprealloc) - ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - - error = xfs_trans_commit(tp, 0, NULL); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - return error; -} - -bhv_vnodeops_t xfs_vnodeops = { - BHV_IDENTITY_INIT(VN_BHV_XFS,VNODE_POSITION_XFS), - .vop_open = xfs_open, - .vop_close = xfs_close, - .vop_read = xfs_read, -#ifdef HAVE_SENDFILE - .vop_sendfile = xfs_sendfile, -#endif -#ifdef HAVE_SPLICE - .vop_splice_read = xfs_splice_read, - .vop_splice_write = xfs_splice_write, -#endif - .vop_write = xfs_write, - .vop_ioctl = xfs_ioctl, - .vop_getattr = xfs_getattr, - .vop_setattr = xfs_setattr, - .vop_access = xfs_access, - .vop_lookup = xfs_lookup, - .vop_create = xfs_create, - .vop_remove = xfs_remove, - .vop_link = xfs_link, - .vop_rename = xfs_rename, - .vop_mkdir = xfs_mkdir, - .vop_rmdir = xfs_rmdir, - .vop_readdir = xfs_readdir, - .vop_symlink = xfs_symlink, - .vop_readlink = xfs_readlink, - .vop_fsync = xfs_fsync, - .vop_inactive = xfs_inactive, - .vop_fid2 = xfs_fid2, - .vop_rwlock = xfs_rwlock, - .vop_rwunlock = xfs_rwunlock, - .vop_bmap = xfs_bmap, - .vop_reclaim = xfs_reclaim, - .vop_attr_get = xfs_attr_get, - .vop_attr_set = xfs_attr_set, - .vop_attr_remove = xfs_attr_remove, - .vop_attr_list = xfs_attr_list, - .vop_link_removed = (vop_link_removed_t)fs_noval, - .vop_vnode_change = (vop_vnode_change_t)fs_noval, - .vop_tosspages = fs_tosspages, - .vop_flushinval_pages = fs_flushinval_pages, - .vop_flush_pages = fs_flush_pages, - .vop_release = xfs_release, - .vop_iflush = xfs_inode_flush, -}; diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c new file mode 100644 index 00000000000..78ed92a46fd --- /dev/null +++ b/fs/xfs/xfs_xattr.c @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2008 Christoph Hellwig. + * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" +#include "xfs_da_format.h" +#include "xfs_inode.h" +#include "xfs_attr.h" +#include "xfs_attr_leaf.h" +#include "xfs_acl.h" + +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + + +static int +xfs_xattr_get(struct dentry *dentry, const char *name, + void *value, size_t size, int xflags) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + int error, asize = size; + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (!size) { + xflags |= ATTR_KERNOVAL; + value = NULL; + } + + error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); + if (error) + return error; + return asize; +} + +static int +xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags, int xflags) +{ + struct xfs_inode *ip = XFS_I(dentry->d_inode); + + if (strcmp(name, "") == 0) + return -EINVAL; + + /* Convert Linux syscall to XFS internal ATTR flags */ + if (flags & XATTR_CREATE) + xflags |= ATTR_CREATE; + if (flags & XATTR_REPLACE) + xflags |= ATTR_REPLACE; + + if (!value) + return -xfs_attr_remove(ip, (unsigned char *)name, xflags); + return -xfs_attr_set(ip, (unsigned char *)name, + (void *)value, size, xflags); +} + +static const struct xattr_handler xfs_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .flags = 0, /* no flags implies user namespace */ + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = ATTR_ROOT, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +static const struct xattr_handler xfs_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .flags = ATTR_SECURE, + .get = xfs_xattr_get, + .set = xfs_xattr_set, +}; + +const struct xattr_handler *xfs_xattr_handlers[] = { + &xfs_xattr_user_handler, + &xfs_xattr_trusted_handler, + &xfs_xattr_security_handler, +#ifdef CONFIG_XFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + NULL +}; + +static unsigned int xfs_xattr_prefix_len(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return sizeof("security"); + else if (flags & XFS_ATTR_ROOT) + return sizeof("trusted"); + else + return sizeof("user"); +} + +static const char *xfs_xattr_prefix(int flags) +{ + if (flags & XFS_ATTR_SECURE) + return xfs_xattr_security_handler.prefix; + else if (flags & XFS_ATTR_ROOT) + return xfs_xattr_trusted_handler.prefix; + else + return xfs_xattr_user_handler.prefix; +} + +static int +xfs_xattr_put_listent( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + unsigned int prefix_len = xfs_xattr_prefix_len(flags); + char *offset; + int arraytop; + + ASSERT(context->count >= 0); + + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) + return 0; + + arraytop = context->count + prefix_len + namelen + 1; + if (arraytop > context->firstu) { + context->count = -1; /* insufficient space */ + return 1; + } + offset = (char *)context->alist + context->count; + strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + offset += prefix_len; + strncpy(offset, (char *)name, namelen); /* real name */ + offset += namelen; + *offset = '\0'; + context->count += prefix_len + namelen + 1; + return 0; +} + +static int +xfs_xattr_put_listent_sizes( + struct xfs_attr_list_context *context, + int flags, + unsigned char *name, + int namelen, + int valuelen, + unsigned char *value) +{ + context->count += xfs_xattr_prefix_len(flags) + namelen + 1; + return 0; +} + +static int +list_one_attr(const char *name, const size_t len, void *data, + size_t size, ssize_t *result) +{ + char *p = data + *result; + + *result += len; + if (!size) + return 0; + if (*result > size) + return -ERANGE; + + strcpy(p, name); + return 0; +} + +ssize_t +xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) +{ + struct xfs_attr_list_context context; + struct attrlist_cursor_kern cursor = { 0 }; + struct inode *inode = dentry->d_inode; + int error; + + /* + * First read the regular on-disk attributes. + */ + memset(&context, 0, sizeof(context)); + context.dp = XFS_I(inode); + context.cursor = &cursor; + context.resynch = 1; + context.alist = data; + context.bufsize = size; + context.firstu = context.bufsize; + + if (size) + context.put_listent = xfs_xattr_put_listent; + else + context.put_listent = xfs_xattr_put_listent_sizes; + + xfs_attr_list_int(&context); + if (context.count < 0) + return -ERANGE; + + /* + * Then add the two synthetic ACL attributes. + */ + if (posix_acl_access_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_ACCESS, + strlen(POSIX_ACL_XATTR_ACCESS) + 1, + data, size, &context.count); + if (error) + return error; + } + + if (posix_acl_default_exists(inode)) { + error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, + strlen(POSIX_ACL_XATTR_DEFAULT) + 1, + data, size, &context.count); + if (error) + return error; + } + + return context.count; +} |
